* [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) @ 2019-03-15 8:22 JiangNing OS 2019-03-18 21:53 ` Jeff Law ` (3 more replies) 0 siblings, 4 replies; 15+ messages in thread From: JiangNing OS @ 2019-03-15 8:22 UTC (permalink / raw) To: gcc-patches [-- Attachment #1: Type: text/plain, Size: 571 bytes --] This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, if (...) x = a; /* x is memory */ /* no else */ We can generate conditional move and remove the branch as below if the target cost is acceptable. r1 = x r2 = a cmp ... csel r3, r1, r2, cond x = r3 This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. In practice, this optimization can improve a real application performance by %4 on aarch64. Thanks, -Jiangning [-- Attachment #2: csel3.patch --] [-- Type: application/octet-stream, Size: 18938 bytes --] Index: ChangeLog =================================================================== --- ChangeLog (revision 269698) +++ ChangeLog (working copy) @@ -1,3 +1,25 @@ +2019-03-15 Jiangning Liu <jiangning@os.amperecomputing.com> + + PR rtl-optimization/89430 + * cse.c (fixed_base_plus_p): Move from here... + * rtlanal.c (fixed_base_plus_p): ...to here... + * rtl.h (fixed_base_plus_p): Add extern declaration. + * ifcvt.c + (noce_process_if_block): For no else_bb and memory store ifcvt case, + early exit if only it is global and current function has address + taken. + (noce_try_cmove_arith): Do ifcvt for no else_bb case, if the variable + is on stack and dominating access can prove the newly introduce memory + access is safe. + (find_all_must_be_sfp_insns): New function. + (find_all_may_be_sfp_insns): New function. + (no_stack_address_taken): New function. + (noce_process_if_block): New function. + (noce_mem_is_on_stack): New function. + (noce_valid_for_dominating): New function. + * ifcvt.h : Add new fields. 
+ * testsuite/gcc.dg/ifcvt-6.c: New. + 2019-03-14 Jason Merrill <jason@redhat.com> Jakub Jelinek <jakub@redhat.com> Index: cse.c =================================================================== --- cse.c (revision 269698) +++ cse.c (working copy) @@ -540,7 +540,6 @@ already as part of an already processed extended basic block. */ static sbitmap cse_visited_basic_blocks; -static bool fixed_base_plus_p (rtx x); static int notreg_cost (rtx, machine_mode, enum rtx_code, int); static int preferable (int, int, int, int); static void new_basic_block (void); @@ -606,30 +605,7 @@ static const struct rtl_hooks cse_rtl_hooks = RTL_HOOKS_INITIALIZER; \f -/* Nonzero if X has the form (PLUS frame-pointer integer). */ -static bool -fixed_base_plus_p (rtx x) -{ - switch (GET_CODE (x)) - { - case REG: - if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) - return true; - if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) - return true; - return false; - - case PLUS: - if (!CONST_INT_P (XEXP (x, 1))) - return false; - return fixed_base_plus_p (XEXP (x, 0)); - - default: - return false; - } -} - /* Dump the expressions in the equivalence class indicated by CLASSP. This function is used only for debugging. */ DEBUG_FUNCTION void Index: ifcvt.c =================================================================== --- ifcvt.c (revision 269698) +++ ifcvt.c (working copy) @@ -76,6 +76,15 @@ /* Whether conditional execution changes were made. */ static int cond_exec_changed_p; +/* REGNO bitmap for defs that may be stack frame address. */ +static bitmap bba_sets_may_be_sfp; + +/* REGNO bitmap for defs that must be stack frame address. */ +static bitmap bba_sets_must_be_sfp; + +/* true if current function doesn't have local address taken. */ +static bool cfun_no_stack_address_taken; + /* Forward references. 
*/ static int count_bb_insns (const_basic_block); static bool cheap_bb_rtx_cost_p (const_basic_block, profile_probability, int); @@ -99,6 +108,13 @@ edge, int); static void noce_emit_move_insn (rtx, rtx); static rtx_insn *block_has_only_trap (basic_block); +static bool noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x); +static bool noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store); +static bool noce_mem_maybe_invalid_p (struct noce_if_info *if_info); +static void find_all_must_be_sfp_insns (void); +static void find_all_may_be_sfp_insns (void); +static bool no_stack_address_taken (void); \f /* Count the number of non-jump active insns in BB. */ @@ -2029,6 +2045,93 @@ return true; } +/* Return true if MEM x is on stack. a_insn contains x, if it exists. */ + +static bool +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) +{ + df_ref use; + + gcc_assert (x); + gcc_assert (MEM_P (x)); + + /* Early exits if find base is a stack register. */ + rtx a = XEXP (x, 0); + if (fixed_base_plus_p(a)) + return true; + + if (!a_insn) + return false; + + if (!reg_mentioned_p (x, a_insn)) + return false; + + /* Check if x is on stack. Assume a mem expression using registers + related to stack register is always on stack. */ + FOR_EACH_INSN_USE (use, a_insn) + if (reg_mentioned_p (DF_REF_REG (use), x) + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + return true; + + return false; +} + +/* Always return true, if there is a dominating write. + + When there is a dominating read from memory on stack, + 1) if x = a is a memory read, return true. + 2) if x = a is a memory write, return true if the memory is on stack. + This is the guarantee the memory is *not* readonly. 
*/ + +static bool +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store) +{ + rtx_insn *insn; + rtx set; + + gcc_assert (MEM_P (x)); + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + /* Dominating store */ + if (rtx_equal_p (x, SET_DEST (set))) + return true; + + /* Dominating load */ + if (rtx_equal_p (x, SET_SRC (set))) + if (is_store && noce_mem_is_on_stack (a_insn, x)) + return true; + } + + return false; +} + +/* Return false if the memory a or b must be valid. + This function must be called before latent swap of a and b. */ + +static bool +noce_mem_maybe_invalid_p (struct noce_if_info *if_info) +{ + if (!if_info->set_b && MEM_P(if_info->orig_x)) + { + if (!if_info->else_bb && MEM_P (if_info->b)) + return !noce_valid_for_dominating (if_info->test_bb, + if_info->insn_a, + if_info->b, true); + } + + /* ??? We could handle this if we knew that a load from A or B could + not trap or fault. This is also true if we've already loaded + from the address along the path from ENTRY. */ + return (may_trap_or_fault_p (if_info->a) + || may_trap_or_fault_p (if_info->b)); +} + /* Try more complex cases involving conditional_move. */ static int @@ -2065,10 +2168,7 @@ is_mem = 1; } - /* ??? We could handle this if we knew that a load from A or B could - not trap or fault. This is also true if we've already loaded - from the address along the path from ENTRY. */ - else if (may_trap_or_fault_p (a) || may_trap_or_fault_p (b)) + else if (noce_mem_maybe_invalid_p (if_info)) return FALSE; /* if (test) x = a + b; else x = c - d; @@ -2234,7 +2334,7 @@ /* If insn to set up A clobbers any registers B depends on, try to swap insn that sets up A with the one that sets up B. If even that doesn't help, punt. 
*/ - if (modified_in_a && !modified_in_b) + if (!modified_in_a && modified_in_b) { if (!noce_emit_bb (emit_b, else_bb, b_simple)) goto end_seq_and_fail; @@ -2242,7 +2342,7 @@ if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; } - else if (!modified_in_a) + else if (!modified_in_b) { if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; @@ -3563,13 +3663,28 @@ } if (!set_b && MEM_P (orig_x)) - /* We want to avoid store speculation to avoid cases like - if (pthread_mutex_trylock(mutex)) - ++global_variable; - Rather than go to much effort here, we rely on the SSA optimizers, - which do a good enough job these days. */ - return FALSE; + { + /* We want to avoid store speculation to avoid cases like + if (pthread_mutex_trylock(mutex)) + ++global_variable; + Tree if conversion cannot handle this case well, and it intends to + help vectorization for loops only. */ + if (!noce_mem_is_on_stack(insn_a, orig_x)) + return FALSE; + /* For case like, + if (pthread_mutex_trylock(mutex)) + ++local_variable; + If any stack variable address is taken, potentially this local + variable could be modified by other threads and introduce store + speculation. */ + if (!cfun_no_stack_address_taken) + return FALSE; + } + + if_info->set_a = set_a; + if_info->set_b = set_b; + if (noce_try_move (if_info)) goto success; if (noce_try_ifelse_collapse (if_info)) @@ -5347,6 +5462,234 @@ return FALSE; } + + +/* Find all insns that must be stack address and store REGNO into + bitmap bba_sets_must_be_sfp. */ + +static void +find_all_must_be_sfp_insns (void) +{ + rtx_insn *a_insn; + basic_block bb; + bool sfp_found; + auto_vec<int, 1> def_count; + df_ref def, use; + + /* Calculate def counts for each insn. 
*/ + def_count.safe_grow_cleared (max_reg_num () + 1); + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + FOR_EACH_INSN_DEF (def, a_insn) + def_count[DF_REF_REGNO (def)]++; + } + + /* Iterate all insns until finding all stack pointers, and store + them into bba_sets_must_be_sfp. */ + bba_sets_must_be_sfp = BITMAP_ALLOC (®_obstack); + do + { + sfp_found = false; + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + if (!REG_P (dest)) + continue; + + /* For the case below, + Control flow: B1->B3, B2->B3 + B1: p = &local_var + B2: p = &global_var + B3: ... = *p + pointer p is an address for either local or global variable. + so we don't treat p as a stack address pointer. To make + algorithm simple, we ignore all non-SSA cases. */ + bool skip_insn = false; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + /* Skip current insn if + 1) already marked as stack pointer. + 2) or see multiple definition points. */ + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) + || def_count[dest_regno] > 1) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Handle case like "x1 = sp + offset". */ + if (fixed_base_plus_p (src)) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + continue; + } + + /* Handle case like "x2 = x1 + offset", in which x1 is already + identified as a stack pointer. 
*/ + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + && CONST_INT_P (XEXP (src, 1))) + { + rtx x1 = XEXP (src, 0); + if (!REG_P (x1)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + if (!rtx_equal_p (x1, DF_REF_REG (use))) + continue; + + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + } + while (sfp_found); +} + +/* Find all insns that may be stack pointer and store REGNO into + bitmap bba_sets_may_be_sfp. We iterate all insns in current func + until no more latent insns generating stack address are found. + The collection of stack pointer bba_sets_may_be_sfp will be used + to help analyze local stack variable address taken. Stack variable + address can be passed out of current frame if only any stack pointer + is passed to hard register or stored into memory. */ + +static void +find_all_may_be_sfp_insns (void) +{ + rtx_insn *a_insn; + basic_block bb; + bool sfp_found; + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); + + /* Iterate all insns that may be stack pointers, and store them into + bitmap bba_sets_must_be_sfp. */ + do + { + sfp_found = false; + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + /* Assume stack pointers can only be generated by single SET. */ + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + /* If we see "hard_register = ...", which implies passing out + of current frame potentially, we don't collect this case. + It can be treated as address taken in no_stack_address_taken */ + if (HARD_REGISTER_P (dest)) + continue; + + /* Memory load and store are not to generate stack pointer. */ + if (MEM_P (src) || MEM_P (dest)) + continue; + + /* Skip insn that is already identified as stack pointer. 
*/ + bool skip_insn = false; + df_ref def; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Collect all latent stack pointer insns. */ + df_ref use; + FOR_EACH_INSN_USE (use, a_insn) + { + rtx x = DF_REF_REG (use); + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx + || bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_may_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + while (sfp_found); +} + +/* Return true if current function doesn't pass stack address out of frame. */ + +static bool +no_stack_address_taken (void) +{ + basic_block bb; + rtx_insn *a_insn; + df_ref use; + + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + /* Skip insn that is already identified as frame pointers. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + /* Check if current insn is using any stack pointer. Ignore + if it doesn't use frame pointers at all. */ + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + /* Store is safe if only the src doesn't contain stack pointer. */ + if (MEM_P (dest) + && !reg_mentioned_p (DF_REF_REG (use), src)) + continue; + + /* All other cases potentially introduce address taken. */ + return false; + } + } + return true; +} + \f /* Main entry point for all if-conversion. AFTER_COMBINE is true if we are after combine pass. */ @@ -5381,6 +5724,11 @@ df_set_flags (DF_LR_RUN_DCE); + /* Prepare for stack variable anslysis. 
*/ + find_all_must_be_sfp_insns (); + find_all_may_be_sfp_insns (); + cfun_no_stack_address_taken = no_stack_address_taken (); + /* Go through each of the basic blocks looking for things to convert. If we have conditional execution, we make multiple passes to allow us to handle IF-THEN{-ELSE} blocks within other IF-THEN{-ELSE} blocks. */ @@ -5413,6 +5761,9 @@ } while (cond_exec_changed_p); + BITMAP_FREE (bba_sets_may_be_sfp); + BITMAP_FREE (bba_sets_must_be_sfp); + #ifdef IFCVT_MULTIPLE_DUMPS if (dump_file) fprintf (dump_file, "\n\n========== no more changes\n"); Index: ifcvt.h =================================================================== --- ifcvt.h (revision 269698) +++ ifcvt.h (working copy) @@ -73,6 +73,8 @@ /* The SET_SRC of INSN_A and INSN_B. */ rtx a, b; + /* The SET of INSN_A and INSN_B. */ + rtx set_a, set_b; /* The SET_DEST of INSN_A. */ rtx x; Index: rtl.h =================================================================== --- rtl.h (revision 269698) +++ rtl.h (working copy) @@ -3751,6 +3751,8 @@ #define hard_frame_pointer_rtx (global_rtl[GR_HARD_FRAME_POINTER]) #define arg_pointer_rtx (global_rtl[GR_ARG_POINTER]) +extern bool fixed_base_plus_p (rtx x); + #ifndef GENERATOR_FILE /* Return the attributes of a MEM rtx. */ static inline const struct mem_attrs * Index: rtlanal.c =================================================================== --- rtlanal.c (revision 269698) +++ rtlanal.c (working copy) @@ -341,6 +341,30 @@ return 0; } +/* Nonzero if X has the form (PLUS frame-pointer integer). 
*/ + +bool +fixed_base_plus_p (rtx x) +{ + switch (GET_CODE (x)) + { + case REG: + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) + return true; + if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) + return true; + return false; + + case PLUS: + if (!CONST_INT_P (XEXP (x, 1))) + return false; + return fixed_base_plus_p (XEXP (x, 0)); + + default: + return false; + } +} + /* Compute an approximation for the offset between the register FROM and TO for the current function, as it was at the start of the routine. */ Index: testsuite/gcc.dg/ifcvt-6.c =================================================================== --- testsuite/gcc.dg/ifcvt-6.c (revision 0) +++ testsuite/gcc.dg/ifcvt-6.c (working copy) @@ -0,0 +1,12 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-fdump-rtl-ce1 -O2" } */ + +unsigned test_ifcvt (unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_try_cmove_arith" "ce1" } } */ ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS @ 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law ` (2 subsequent siblings) 3 siblings, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-03-18 21:53 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by 4% on aarch64. This seems like something that should be addressed for gcc-10 rather than gcc-9. Can we come back to it once gcc-10 development starts? jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law @ 2019-04-30 16:00 ` Jeff Law 2019-05-05 0:11 ` JiangNing OS 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 21:59 ` Jeff Law 3 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-04-30 16:00 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by %4 on aarch64. [ ... ] I notice that a couple weeks you'd sent a similar patch under the heading "Fixing ifcvt issue as exposed by BZ89430". Does this patch supersede the earlier patch? Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-04-30 16:00 ` Jeff Law @ 2019-05-05 0:11 ` JiangNing OS 0 siblings, 0 replies; 15+ messages in thread From: JiangNing OS @ 2019-05-05 0:11 UTC (permalink / raw) To: Jeff Law, gcc-patches Hi Jeff, Yes. The latter one "[PATCH] improve ifcvt optimization (PR rtl-optimization/89430)" supersedes the earlier one " Fixing ifcvt issue as exposed by BZ89430". Thanks, -Jiangning -----Original Message----- From: Jeff Law <law@redhat.com> Sent: Tuesday, April 30, 2019 11:54 PM To: JiangNing OS <jiangning@os.amperecomputing.com>; gcc-patches@gcc.gnu.org Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the > simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by %4 on aarch64. [ ... ] I notice that a couple weeks you'd sent a similar patch under the heading "Fixing ifcvt issue as exposed by BZ89430". Does this patch supersede the earlier patch? Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law @ 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 0:36 ` JiangNing OS 2019-06-17 21:59 ` Jeff Law 3 siblings, 1 reply; 15+ messages in thread From: Kyrill Tkachov @ 2019-05-24 14:54 UTC (permalink / raw) To: JiangNing OS, gcc-patches; +Cc: ebotcazou, steven, law Hi Jiangning On 3/15/19 4:46 AM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the > simple case below, > > if (...) >     x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the > target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any > address taken in current function, so the store speculation can be > avoided. > > In practice, this optimization can improve a real application > performance by 4% on aarch64. > Now that GCC 10 development is open, this should be appropriate for consideration. I've cc'ed folks who are either listed maintainers in this area or have reviewed patches in this area in my recent memory. Thanks, Kyrill > Thanks, > -Jiangning ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-05-24 14:54 ` Kyrill Tkachov @ 2019-06-17 0:36 ` JiangNing OS 0 siblings, 0 replies; 15+ messages in thread From: JiangNing OS @ 2019-06-17 0:36 UTC (permalink / raw) To: Kyrill Tkachov, gcc-patches; +Cc: ebotcazou, steven, law Hi, May I know any feedback comments to this patch? Thanks, -Jiangning From: Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> Sent: Friday, May 24, 2019 10:55 PM To: JiangNing OS <jiangning@os.amperecomputing.com>; gcc-patches@gcc.gnu.org Cc: ebotcazou@adacore.com; steven@gcc.gnu.org; law@redhat.com Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) Hi Jiangning On 3/15/19 4:46 AM, JiangNing OS wrote: This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, if (...) x = a; /* x is memory */ /* no else */ We can generate conditional move and remove the branch as below if the target cost is acceptable. r1 = x r2 = a cmp ... csel r3, r1, r2, cond x = r3 This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. In practice, this optimization can improve a real application performance by %4 on aarch64. Now that GCC 10 development is open, this should appropriate for considering. I've cc'ed folks who are either listed maintainers in this area or have reviewed patches in this area in my recent memory. Thanks, Kyrill Thanks, -Jiangning ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS ` (2 preceding siblings ...) 2019-05-24 14:54 ` Kyrill Tkachov @ 2019-06-17 21:59 ` Jeff Law 2019-06-20 9:53 ` JiangNing OS 3 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-17 21:59 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. So at a high level should we be doing this in gimple rather than RTL? We're going to have a lot more information about types, better infrastructure for looking at uses/defs, access to the alias oracle, we should be able to accurately distinguish between potentially shared objects vs those which are local to the thread, etc. We lose the low level costing information though. I'm still going to go through the patch and do some level of review, but I do think we need to answer the higher level question though. > > Index: ifcvt.c > =================================================================== > --- ifcvt.c (revision 269698) > +++ ifcvt.c (working copy) > @@ -2029,6 +2045,93 @@ > return true; > } > > +/* Return true if MEM x is on stack. a_insn contains x, if it exists. */ Nits: We typically refer to parameters, variables, etc in comments using upper case. You'll need to review the entire patch for these its. So perhaps the comment should be something like: /* Return true of X, a MEM expression, is on the stack. A_INSN contains X if A_INSN exists. 
*/ Just from a design standpoint, what are the consequences if this function returns true for something that isn't actually in the stack or false for something that is on the stack? > + > +static bool > +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) > +{ > + df_ref use; > + > + gcc_assert (x); > + gcc_assert (MEM_P (x)); > + > + /* Early exits if find base is a stack register. */ > + rtx a = XEXP (x, 0); > + if (fixed_base_plus_p(a)) > + return true; Nit: Space between the function name and its open paren for arguments. ie if (fixed_base_plus_p (a) ^ I see other instances of this nit, please review the patch and correct them. > + > + if (!a_insn) > + return false; I'm not sure what the calling profile is for this function, but this is a cheaper test, so you might consider moving it before the test of fixed_base_plus_p. > + > + if (!reg_mentioned_p (x, a_insn)) > + return false; > + > + /* Check if x is on stack. Assume a mem expression using registers > + related to stack register is always on stack. */ > + FOR_EACH_INSN_USE (use, a_insn) > + if (reg_mentioned_p (DF_REF_REG (use), x) > + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > + return true; > + > + return false; > +} So is X always a MEM? Just wanted to make sure I understand. reg_mentioned_p will do the right thing (using rtx_equal_p) for the comparison. > + > +/* Always return true, if there is a dominating write. > + > + When there is a dominating read from memory on stack, > + 1) if x = a is a memory read, return true. > + 2) if x = a is a memory write, return true if the memory is on stack. > + This is the guarantee the memory is *not* readonly. 
*/ > + > +static bool > +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > + const_rtx x, bool is_store) > +{ > + rtx_insn *insn; > + rtx set; > + > + gcc_assert (MEM_P (x)); > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + /* Dominating store */ > + if (rtx_equal_p (x, SET_DEST (set))) > + return true; > + > + /* Dominating load */ > + if (rtx_equal_p (x, SET_SRC (set))) > + if (is_store && noce_mem_is_on_stack (a_insn, x)) > + return true; > + } > + > + return false; > +} So what would be the consequences here of returning false when in fact there was a dominating read or write? That could easily happen if the dominating read or write was not a single_set. I'm guessing that from a design standpoint you're trying to find cases where you know an object was written to within the block and does not escape. So returning false when there was a dominating write is safe. Returning true when there was not would be disastrous. Right? > @@ -5347,6 +5462,234 @@ > > return FALSE; > } > + > + > +/* Find all insns that must be stack address and store REGNO into > + bitmap bba_sets_must_be_sfp. */ > + > +static void > +find_all_must_be_sfp_insns (void) > +{ > + rtx_insn *a_insn; > + basic_block bb; > + bool sfp_found; > + auto_vec<int, 1> def_count; > + df_ref def, use; > + > + /* Calculate def counts for each insn. */ > + def_count.safe_grow_cleared (max_reg_num () + 1); > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + if (!INSN_P (a_insn)) > + continue; > + > + FOR_EACH_INSN_DEF (def, a_insn) > + def_count[DF_REF_REGNO (def)]++; > + }Is there a reason why you can't use the information computed by reginfo? > + > + /* Iterate all insns until finding all stack pointers, and store > + them into bba_sets_must_be_sfp. 
*/ > + bba_sets_must_be_sfp = BITMAP_ALLOC (®_obstack); > + do > + { > + sfp_found = false; > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); So again, we can get things that aren't a single_set here and they'll be ignored. But I believe that's safe as you're trying to identify insns which store stack addresses into a REG. Missing a more complicated case where a stack address is stored into a REG is safe. Right? > + > + if (!REG_P (dest)) > + continue; > + > + /* For the case below, > + Control flow: B1->B3, B2->B3 > + B1: p = &local_var > + B2: p = &global_var > + B3: ... = *p > + pointer p is an address for either local or global variable. > + so we don't treat p as a stack address pointer. To make > + algorithm simple, we ignore all non-SSA cases. */ > + bool skip_insn = false; > + unsigned int dest_regno = 0; > + FOR_EACH_INSN_DEF (def, a_insn) > + { > + dest_regno = DF_REF_REGNO (def); > + /* Skip current insn if > + 1) already marked as stack pointer. > + 2) or see multiple definition points. */ > + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) > + || def_count[dest_regno] > 1) > + { > + skip_insn = true; > + break; > + } > + } > + if (skip_insn) > + continue; Right. Again I wonder if you should be using the info computed by regstat. You can get single use information easily from that. > + > + /* Handle case like "x1 = sp + offset". */ > + if (fixed_base_plus_p (src)) > + { > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > + sfp_found = true; > + continue; > + } Looks like your you've got an indentation problem here. Most likely you've got a case where you've got 8 spaces that should be replaced with a tab. > + > + /* Handle case like "x2 = x1 + offset", in which x1 is already > + identified as a stack pointer. 
*/ > + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > + && CONST_INT_P (XEXP (src, 1))) > + { > + rtx x1 = XEXP (src, 0); > + if (!REG_P (x1)) > + continue; > + > + FOR_EACH_INSN_USE (use, a_insn) > + { > + if (!rtx_equal_p (x1, DF_REF_REG (use))) > + continue; > + > + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > + { > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > + sfp_found = true; > + break; > + } > + } So how much is there to be gained from going from this kind of localized computation to global? It shouldn't be hard to turn this into a truly global algorithm, but I'm not sure it's worth the effort. I _do_ think it's worth visiting blocks in dominator order. It's better than what you're doing, but nowhere near as expensive as a global propagation engine. Is it worth handling simple copies in the code above? > + } > + } > + } > + while (sfp_found); > +} > + > +/* Find all insns that may be stack pointer and store REGNO into > + bitmap bba_sets_may_be_sfp. We iterate all insns in current func > + until no more latent insns generating stack address are found. > + The collection of stack pointer bba_sets_may_be_sfp will be used > + to help analyze local stack variable address taken. Stack variable > + address can be passed out of current frame if only any stack pointer > + is passed to hard register or stored into memory. */ Shouldn't "may be stack pointer" be "must be stack pointer"? > + > +static void > +find_all_may_be_sfp_insns (void) > +{ > + rtx_insn *a_insn; > + basic_block bb; > + bool sfp_found; > + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); > + > + /* Iterate all insns that may be stack pointers, and store them into > + bitmap bba_sets_must_be_sfp. */ > + do > + { > + sfp_found = false; > + FOR_ALL_BB_FN (bb, cfun) Again, you may ultimately be better off with a dominator walk here. > + FOR_BB_INSNS (bb, a_insn) > + { > + /* Assume stack pointers can only be generated by single SET. 
*/ > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); I'm not sure that's a safe assumption. > + > + /* If we see "hard_register = ...", which implies passing out > + of current frame potentially, we don't collect this case. > + It can be treated as address taken in no_stack_address_taken */ > + if (HARD_REGISTER_P (dest)) > + continue; > + > + /* Memory load and store are not to generate stack pointer. */ > + if (MEM_P (src) || MEM_P (dest)) > + continue; > + > + /* Skip insn that is already identified as stack pointer. */ > + bool skip_insn = false; > + df_ref def; > + unsigned int dest_regno = 0; > + FOR_EACH_INSN_DEF (def, a_insn) > + { > + dest_regno = DF_REF_REGNO (def); > + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) > + { > + skip_insn = true; > + break; > + } > + } So earlier you've verified you've got a single set. I'm not sure you need an iterator over the defs. Can't you just look at SET_DEST (sset_a) which is conveniently in DEST? I don't like the term "stack pointer" here -- most of us read "stack pointer" and we think the register holding the current stack pointer. Would "pointer into the stack" be more appropriate? > +/* Return true if current function doesn't pass stack address out of frame. */ > + > +static bool > +no_stack_address_taken (void) > +{ > + basic_block bb; > + rtx_insn *a_insn; > + df_ref use; > + > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + if (!INSN_P (a_insn)) > + continue; > + > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); > + > + /* Skip insn that is already identified as frame pointers. */ > + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > + continue; > + > + FOR_EACH_INSN_USE (use, a_insn) > + { > + /* Check if current insn is using any stack pointer. Ignore > + if it doesn't use frame pointers at all. 
*/ > + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) > + continue; > + > + /* Load cannot introduce address taken. */ > + if (MEM_P (src)) > + continue; > + > + /* Store is safe if only the src doesn't contain stack pointer. */ > + if (MEM_P (dest) > + && !reg_mentioned_p (DF_REF_REG (use), src)) > + continue; > + > + /* All other cases potentially introduce address taken. */ > + return false; > + } So what if you have a PARALLEL where one entry causes an escape of a pointer into the stack and another entry does some other operation? If so, don't you miss the fact that you've got an escaping pointer into the stack? I don't think you can restrict yourself to single_sets here. Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-17 21:59 ` Jeff Law @ 2019-06-20 9:53 ` JiangNing OS 2019-06-20 23:13 ` Jeff Law 0 siblings, 1 reply; 15+ messages in thread From: JiangNing OS @ 2019-06-20 9:53 UTC (permalink / raw) To: Jeff Law, gcc-patches [-- Attachment #1: Type: text/plain, Size: 16040 bytes --] Hi Jeff, Appreciate your effort to review my patch! I've updated my patch as attached. See my answers below. > in current function, so the store speculation can be avoided. > So at a high level should we be doing this in gimple rather than RTL? > We're going to have a lot more information about types, better > infrastructure for looking at uses/defs, access to the alias oracle, we should > be able to accurately distinguish between potentially shared objects vs those > which are local to the thread, etc. We lose the low level costing information > though. > > I'm still going to go through the patch and do some level of review, but I do > think we need to answer the higher level question though. > I have the following reasons, 1) Following the clue Richard B gave me before about parameter --param allow-store-data-races, I did check the middle-end pass tree-if-conv, but I think this pass at the moment doesn't work for the issue I'm trying to solve. Tree-if-conv is to do if conversion for loop, and its final goal is to help loop vectorization, while my case doesn't have a loop at all. 2) My current solution fits into current back-end if-conversion pass very well. I don't want to invent a new framework to solve this relatively small issue. Besides, this back-end patch doesn't only enhance store speculation detection, but also fix a bug in the original code. > Nits: We typically refer to parameters, variables, etc in comments using > upper case. You'll need to review the entire patch for these its. > > So perhaps the comment should be something like: > > /* Return true of X, a MEM expression, is on the stack. 
A_INSN contains > X if A_INSN exists. */ > Fixed in attached new patch. > > Just from a design standpoint, what are the consequences if this function > returns true for something that isn't actually in the stack or false for > something that is on the stack? > If noce_mem_is_on_stack returns true for something that isn't actually in the stack, it could potentially introduce store speculation, then the if-conversion optimization will be incorrect. If this function returns false for something that is on stack, it doesn't matter, because the optimization will not be triggered. > Nit: Space between the function name and its open paren for arguments. ie > > if (fixed_base_plus_p (a) > ^ > I see other instances of this nit, please review the patch and correct them. > Fixed in attached new patch. > > > + > > + if (!a_insn) > > + return false; > I'm not sure what the calling profile is for this function, but this is a cheaper > test, so you might consider moving it before the test of fixed_base_plus_p. > Fixed in attached new patch. > > > + > > + if (!reg_mentioned_p (x, a_insn)) > > + return false; > > + > > + /* Check if x is on stack. Assume a mem expression using registers > > + related to stack register is always on stack. */ > > + FOR_EACH_INSN_USE (use, a_insn) > > + if (reg_mentioned_p (DF_REF_REG (use), x) > > + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > > + return true; > > + > > + return false; > > +} > So is X always a MEM? Just wanted to make sure I understand. > reg_mentioned_p will do the right thing (using rtx_equal_p) for the > comparison. > Yes. X is always a MEM. There is an assertion for this in the code above. > > > + > > +/* Always return true, if there is a dominating write. > > + > > + When there is a dominating read from memory on stack, > > + 1) if x = a is a memory read, return true. > > + 2) if x = a is a memory write, return true if the memory is on stack. > > + This is the guarantee the memory is *not* readonly. 
*/ > > + > > +static bool > > +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > > + const_rtx x, bool is_store) { > > + rtx_insn *insn; > > + rtx set; > > + > > + gcc_assert (MEM_P (x)); > > + > > + FOR_BB_INSNS (bb, insn) > > + { > > + set = single_set (insn); > > + if (!set) > > + continue; > > + > > + /* Dominating store */ > > + if (rtx_equal_p (x, SET_DEST (set))) > > + return true; > > + > > + /* Dominating load */ > > + if (rtx_equal_p (x, SET_SRC (set))) > > + if (is_store && noce_mem_is_on_stack (a_insn, x)) > > + return true; > > + } > > + > > + return false; > > +} > So what would be the consequences here of returning false when in fact > there was a dominating read or write? That could easily happen if the > dominating read or write was not a single_set. If return false when in fact there is a dominating read or write, the optimization will not be triggered, because noce_mem_maybe_invalid_p will be true, which is playing the same role as may_trap_or_fault_p. > > I'm guessing that from a design standpoint you're trying to find cases where > you know an object was written to within the block and does not escape. So > returning false when there was a dominating write is safe. > Returning true when there was not would be disastrous. Right? > YES! You've completely understood my code! 😊 > > > > > > @@ -5347,6 +5462,234 @@ > > > > return FALSE; > > } > > + > > + > > +/* Find all insns that must be stack address and store REGNO into > > + bitmap bba_sets_must_be_sfp. */ > > + > > +static void > > +find_all_must_be_sfp_insns (void) > > +{ > > + rtx_insn *a_insn; > > + basic_block bb; > > + bool sfp_found; > > + auto_vec<int, 1> def_count; > > + df_ref def, use; > > + > > + /* Calculate def counts for each insn. 
*/ > > + def_count.safe_grow_cleared (max_reg_num () + 1); FOR_ALL_BB_FN > > + (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + if (!INSN_P (a_insn)) > > + continue; > > + > > + FOR_EACH_INSN_DEF (def, a_insn) > > + def_count[DF_REF_REGNO (def)]++; > > + }Is there a reason why you can't use the information computed by > reginfo? > Fixed in attached new patch. > > > > > + > > + /* Iterate all insns until finding all stack pointers, and store > > + them into bba_sets_must_be_sfp. */ bba_sets_must_be_sfp = > > + BITMAP_ALLOC (®_obstack); do > > + { > > + sfp_found = false; > > + FOR_ALL_BB_FN (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > So again, we can get things that aren't a single_set here and they'll be > ignored. But I believe that's safe as you're trying to identify insns which store > stack addresses into a REG. Missing a more complicated case where a stack > address is stored into a REG is safe. Right? > Yes. Missing some pointers to stack is safe here, because we will use it to check if a memory in if-conversion candidate doesn't have store speculation. If we miss it, the optimization will not be triggered, so there will not be risky. > > > > > + > > + if (!REG_P (dest)) > > + continue; > > + > > + /* For the case below, > > + Control flow: B1->B3, B2->B3 > > + B1: p = &local_var > > + B2: p = &global_var > > + B3: ... = *p > > + pointer p is an address for either local or global variable. > > + so we don't treat p as a stack address pointer. To make > > + algorithm simple, we ignore all non-SSA cases. */ > > + bool skip_insn = false; > > + unsigned int dest_regno = 0; > > + FOR_EACH_INSN_DEF (def, a_insn) > > + { > > + dest_regno = DF_REF_REGNO (def); > > + /* Skip current insn if > > + 1) already marked as stack pointer. > > + 2) or see multiple definition points. 
*/ > > + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) > > + || def_count[dest_regno] > 1) > > + { > > + skip_insn = true; > > + break; > > + } > > + } > > + if (skip_insn) > > + continue; > Right. Again I wonder if you should be using the info computed by regstat. > You can get single use information easily from that. > Yes. Fixed in attached new patch. > > + > > + /* Handle case like "x1 = sp + offset". */ > > + if (fixed_base_plus_p (src)) > > + { > > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > > + sfp_found = true; > > + continue; > > + } > Looks like your you've got an indentation problem here. Most likely you've > got a case where you've got 8 spaces that should be replaced with a tab. > Fixed in attached new patch. > > > + > > + /* Handle case like "x2 = x1 + offset", in which x1 is already > > + identified as a stack pointer. */ > > + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > > + && CONST_INT_P (XEXP (src, 1))) > > + { > > + rtx x1 = XEXP (src, 0); > > + if (!REG_P (x1)) > > + continue; > > + > > + FOR_EACH_INSN_USE (use, a_insn) > > + { > > + if (!rtx_equal_p (x1, DF_REF_REG (use))) > > + continue; > > + > > + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO > (use))) > > + { > > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > > + sfp_found = true; > > + break; > > + } > > + } > So how much is there to be gained from going from this kind of localized > computation to global? It shouldn't be hard to turn this into a truly global > algorithm, but I'm not sure it's worth the effort. > > I _do_ think it's worth visiting blocks in dominator order. It's better than > what you're doing, but nowhere near as expensive as a global propagation > engine. > Fixed in attached new patch using dom_walk class. Please review again. > Is it worth handling simple copies in the code above? I thought it less likely to have a copy for pointer to stack before register allocation. Right? 
> > > > + } > > + } > > + } > > + while (sfp_found); > > +} > > + > > +/* Find all insns that may be stack pointer and store REGNO into > > + bitmap bba_sets_may_be_sfp. We iterate all insns in current func > > + until no more latent insns generating stack address are found. > > + The collection of stack pointer bba_sets_may_be_sfp will be used > > + to help analyze local stack variable address taken. Stack variable > > + address can be passed out of current frame if only any stack pointer > > + is passed to hard register or stored into memory. */ > Shouldn't "may be stack pointer" be "must be stack pointer"? > There is a typo in the comments. Fixed in attached new patch. > > > + > > +static void > > +find_all_may_be_sfp_insns (void) > > +{ > > + rtx_insn *a_insn; > > + basic_block bb; > > + bool sfp_found; > > + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); > > + > > + /* Iterate all insns that may be stack pointers, and store them into > > + bitmap bba_sets_must_be_sfp. */ > > + do > > + { > > + sfp_found = false; > > + FOR_ALL_BB_FN (bb, cfun) > Again, you may ultimately be better off with a dominator walk here. Yes. Fixed in attached new patch. > > > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + /* Assume stack pointers can only be generated by single SET. */ > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > I'm not sure that's a safe assumption. Fixed in attached new patch. > > > > + > > + /* If we see "hard_register = ...", which implies passing out > > + of current frame potentially, we don't collect this case. > > + It can be treated as address taken in no_stack_address_taken */ > > + if (HARD_REGISTER_P (dest)) > > + continue; > > + > > + /* Memory load and store are not to generate stack pointer. */ > > + if (MEM_P (src) || MEM_P (dest)) > > + continue; > > + > > + /* Skip insn that is already identified as stack pointer. 
*/ > > + bool skip_insn = false; > > + df_ref def; > > + unsigned int dest_regno = 0; > > + FOR_EACH_INSN_DEF (def, a_insn) > > + { > > + dest_regno = DF_REF_REGNO (def); > > + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) > > + { > > + skip_insn = true; > > + break; > > + } > > + } > So earlier you've verified you've got a single set. I'm not sure you need an > iterator over the defs. Can't you just look at SET_DEST > (sset_a) which is conveniently in DEST? Fixed in attached new patch. > > > I don't like the term "stack pointer" here -- most of us read "stack pointer" > and we think the register holding the current stack pointer. > Would "pointer into the stack" be more appropriate? > Fixed in attached new patch. > > > +/* Return true if current function doesn't pass stack address out of > > +frame. */ > > + > > +static bool > > +no_stack_address_taken (void) > > +{ > > + basic_block bb; > > + rtx_insn *a_insn; > > + df_ref use; > > + > > + FOR_ALL_BB_FN (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + if (!INSN_P (a_insn)) > > + continue; > > + > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > > + > > + /* Skip insn that is already identified as frame pointers. */ > > + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > > + continue; > > + > > + FOR_EACH_INSN_USE (use, a_insn) > > + { > > + /* Check if current insn is using any stack pointer. Ignore > > + if it doesn't use frame pointers at all. */ > > + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) > > + continue; > > + > > + /* Load cannot introduce address taken. */ > > + if (MEM_P (src)) > > + continue; > > + > > + /* Store is safe if only the src doesn't contain stack pointer. */ > > + if (MEM_P (dest) > > + && !reg_mentioned_p (DF_REF_REG (use), src)) > > + continue; > > + > > + /* All other cases potentially introduce address taken. 
*/ > > + return false; > > + } > So what if you have a PARALLEL where one entry causes an escape of a > pointer into the stack and another entry does some other operation? If so, > don't you miss the fact that you've got an escaping pointer into the stack? I > don't think you can restrict yourself to single_sets here. Yes. You are right. I have added code to handle PARALLEL case. I handle it in a rather conservative way though. Review again, please! > > Jeff Thanks, -Jiangning [-- Attachment #2: csel4.patch --] [-- Type: application/octet-stream, Size: 20111 bytes --] diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c75b08e..59c8dd6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2019-06-18 Jiangning Liu <jiangning@os.amperecomputing.com> + + PR rtl-optimization/89430 + * cse.c (fixed_base_plus_p): Move from here... + * rtlanal.c (fixed_base_plus_p): ...to here... + * rtl.h (fixed_base_plus_p): Add extern declaration. + * ifcvt.c + (noce_process_if_block): For no else_bb and memory store ifcvt case, + early exit if only it is global and current function has address + taken. + (noce_try_cmove_arith): Do ifcvt for no else_bb case, if the variable + is on stack and dominating access can prove the newly introduce memory + access is safe. + (find_all_must_be_sfp_insns): New function. + (find_all_may_be_sfp_insns): New function. + (no_stack_address_taken): New function. + (no_need_to_analyze_sfp): New function. + (noce_mem_is_on_stack): New function. + (noce_valid_for_dominating): New function. + * ifcvt.h : Add new fields. + * testsuite/gcc.dg/ifcvt-6.c: New. + 2019-06-17 Jakub Jelinek <jakub@redhat.com> * omp-low.c (struct omp_context): Add scan_inclusive field. diff --git a/gcc/cse.c b/gcc/cse.c index 35840a6d..a6fcf92 100644 --- a/gcc/cse.c +++ b/gcc/cse.c @@ -540,7 +540,6 @@ static bitmap cse_ebb_live_in, cse_ebb_live_out; already as part of an already processed extended basic block. 
*/ static sbitmap cse_visited_basic_blocks; -static bool fixed_base_plus_p (rtx x); static int notreg_cost (rtx, machine_mode, enum rtx_code, int); static int preferable (int, int, int, int); static void new_basic_block (void); @@ -606,29 +605,6 @@ static machine_mode cse_cc_succs (basic_block, basic_block, rtx, rtx, static const struct rtl_hooks cse_rtl_hooks = RTL_HOOKS_INITIALIZER; \f -/* Nonzero if X has the form (PLUS frame-pointer integer). */ - -static bool -fixed_base_plus_p (rtx x) -{ - switch (GET_CODE (x)) - { - case REG: - if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) - return true; - if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) - return true; - return false; - - case PLUS: - if (!CONST_INT_P (XEXP (x, 1))) - return false; - return fixed_base_plus_p (XEXP (x, 0)); - - default: - return false; - } -} /* Dump the expressions in the equivalence class indicated by CLASSP. This function is used only for debugging. */ diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c index 7b2f6e6..d04040e 100644 --- a/gcc/ifcvt.c +++ b/gcc/ifcvt.c @@ -46,6 +46,7 @@ #include "rtl-iter.h" #include "ifcvt.h" #include "params.h" +#include "domwalk.h" #ifndef MAX_CONDITIONAL_EXECUTE #define MAX_CONDITIONAL_EXECUTE \ @@ -76,6 +77,18 @@ static int num_true_changes; /* Whether conditional execution changes were made. */ static int cond_exec_changed_p; +/* REGNO bitmap for defs that may be stack frame address. */ +static bitmap bba_sets_may_be_sfp; + +/* REGNO bitmap for defs that must be stack frame address. */ +static bitmap bba_sets_must_be_sfp; + +/* true if current function doesn't have local address taken. */ +static bool cfun_no_stack_address_taken; + +/* A flag to indicate if a pointer to stack is found. */ +static bool sfp_found; + /* Forward references. 
*/ static int count_bb_insns (const_basic_block); static bool cheap_bb_rtx_cost_p (const_basic_block, profile_probability, int); @@ -99,6 +112,14 @@ static int dead_or_predicable (basic_block, basic_block, basic_block, edge, int); static void noce_emit_move_insn (rtx, rtx); static rtx_insn *block_has_only_trap (basic_block); +static bool noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x); +static bool noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store); +static bool noce_mem_maybe_invalid_p (struct noce_if_info *if_info); +static bool no_need_to_analyze_sfp (const_rtx set); +static void find_all_must_be_sfp_insns (void); +static void find_all_may_be_sfp_insns (void); +static bool no_stack_address_taken (void); \f /* Count the number of non-jump active insns in BB. */ @@ -2029,6 +2050,94 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple) return true; } +/* Return true of X, a MEM expression, is on the stack. A_INSN contains + X if A_INSN exists. */ + +static bool +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) +{ + df_ref use; + + gcc_assert (x); + gcc_assert (MEM_P (x)); + + if (!a_insn) + return false; + + /* Early exits if find base is a stack register. */ + rtx a = XEXP (x, 0); + if (fixed_base_plus_p (a)) + return true; + + if (!reg_mentioned_p (x, a_insn)) + return false; + + /* Check if x is on stack. Assume a mem expression using registers + related to stack register is always on stack. */ + FOR_EACH_INSN_USE (use, a_insn) + if (reg_mentioned_p (DF_REF_REG (use), x) + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + return true; + + return false; +} + +/* Always return true, if there is a dominating store. + + When there is a dominating load from memory on stack, + 1) if A_INSN is a memory read, return true. + 2) if A_INSN is a memory write, return true if the memory is on stack. + This is to guarantee the memory is *not* readonly. 
*/ + +static bool +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store) +{ + rtx_insn *insn; + rtx set; + + gcc_assert (MEM_P (x)); + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + /* Dominating store. */ + if (rtx_equal_p (x, SET_DEST (set))) + return true; + + /* Dominating load. */ + if (rtx_equal_p (x, SET_SRC (set))) + if (is_store && noce_mem_is_on_stack (a_insn, x)) + return true; + } + + return false; +} + +/* Return false if the memory A or B must be valid. + This function must be called before latent swap of A and B. */ + +static bool +noce_mem_maybe_invalid_p (struct noce_if_info *if_info) +{ + if (!if_info->set_b && MEM_P (if_info->orig_x)) + { + if (!if_info->else_bb && MEM_P (if_info->b)) + return !noce_valid_for_dominating (if_info->test_bb, + if_info->insn_a, + if_info->b, true); + } + + /* ??? We could handle this if we knew that a load from A or B could + not trap or fault. This is also true if we've already loaded + from the address along the path from ENTRY. */ + return (may_trap_or_fault_p (if_info->a) + || may_trap_or_fault_p (if_info->b)); +} + /* Try more complex cases involving conditional_move. */ static int @@ -2065,10 +2174,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) is_mem = 1; } - /* ??? We could handle this if we knew that a load from A or B could - not trap or fault. This is also true if we've already loaded - from the address along the path from ENTRY. */ - else if (may_trap_or_fault_p (a) || may_trap_or_fault_p (b)) + else if (noce_mem_maybe_invalid_p (if_info)) return FALSE; /* if (test) x = a + b; else x = c - d; @@ -2234,7 +2340,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) /* If insn to set up A clobbers any registers B depends on, try to swap insn that sets up A with the one that sets up B. If even that doesn't help, punt. 
*/ - if (modified_in_a && !modified_in_b) + if (!modified_in_a && modified_in_b) { if (!noce_emit_bb (emit_b, else_bb, b_simple)) goto end_seq_and_fail; @@ -2242,7 +2348,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; } - else if (!modified_in_a) + else if (!modified_in_b) { if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; @@ -3563,12 +3669,27 @@ noce_process_if_block (struct noce_if_info *if_info) } if (!set_b && MEM_P (orig_x)) - /* We want to avoid store speculation to avoid cases like - if (pthread_mutex_trylock(mutex)) - ++global_variable; - Rather than go to much effort here, we rely on the SSA optimizers, - which do a good enough job these days. */ - return FALSE; + { + /* We want to avoid store speculation to avoid cases like + if (pthread_mutex_trylock (mutex)) + ++global_variable; + Tree if conversion cannot handle this case well, and it intends to + help vectorization for loops only. */ + if (!noce_mem_is_on_stack (insn_a, orig_x)) + return FALSE; + + /* For case like, + if (pthread_mutex_trylock (mutex)) + ++local_variable; + If any stack variable address is taken, potentially this local + variable could be modified by other threads and introduce store + speculation. 
*/ + if (!cfun_no_stack_address_taken) + return FALSE; + } + + if_info->set_a = set_a; + if_info->set_b = set_b; if (noce_try_move (if_info)) goto success; @@ -5347,6 +5468,297 @@ dead_or_predicable (basic_block test_bb, basic_block merge_bb, return FALSE; } + +class must_be_sfp_dom_walker : public dom_walker +{ +public: + must_be_sfp_dom_walker (cdi_direction direction) : dom_walker (direction) + {} + + virtual edge before_dom_children (basic_block); +}; + +edge +must_be_sfp_dom_walker::before_dom_children (basic_block bb) +{ + rtx_insn *a_insn; + + FOR_BB_INSNS (bb, a_insn) + { + df_ref def, use; + + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + if (!REG_P (dest)) + continue; + + /* For the case below, + Control flow: B1->B3, B2->B3 + B1: p = &local_var + B2: p = &global_var + B3: ... = *p + pointer p is an address for either local or global variable. + so we don't treat p as a stack address pointer. To make + algorithm simple, we ignore all non-SSA cases. */ + bool skip_insn = false; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + /* Skip current insn if + 1) it is already marked as a pointer to the stack. + 2) or we see multiple definition points. */ + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) + || REG_N_SETS (dest_regno) > 1) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Handle case like "x1 = sp + offset". */ + if (fixed_base_plus_p (src)) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + continue; + } + + /* Handle case like "x2 = x1 + offset", in which x1 is already + identified as a pointer to the stack. 
*/ + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + && CONST_INT_P (XEXP (src, 1))) + { + rtx x1 = XEXP (src, 0); + if (!REG_P (x1)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + if (!rtx_equal_p (x1, DF_REF_REG (use))) + continue; + + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + return NULL; +} + +/* Find all insns that must be stack address and store REGNO into + bitmap BBA_SETS_MUST_BE_SFP. */ + +static void +find_all_must_be_sfp_insns (void) +{ + do { + sfp_found = false; + must_be_sfp_dom_walker (CDI_DOMINATORS) + .walk (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + } while (sfp_found); +} + +class may_be_sfp_dom_walker : public dom_walker +{ +public: + may_be_sfp_dom_walker (cdi_direction direction) : dom_walker (direction) + {} + + virtual edge before_dom_children (basic_block); +}; + +/* Return true if we are sure SET doesn't need to be further analyzed + to calcluate BBA_SETS_MAY_BE_SFP. */ + +static bool no_need_to_analyze_sfp (const_rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + + /* Skip insn that is already identified as a pointer to the stack. */ + unsigned int dest_regno = REGNO (dest); + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) + return true; + + /* If we see "hard_register = ...", which implies passing out + current frame potentially, we don't collect this case. + can be treated as address taken in no_stack_address_taken. */ + if (HARD_REGISTER_P (dest)) + return true; + + /* Memory load and store are not to generate a pointer to the stack. 
*/ + if (MEM_P (src) || MEM_P (dest)) + return true; + + return false; +} + +edge +may_be_sfp_dom_walker::before_dom_children (basic_block bb) +{ + rtx_insn *a_insn; + const_rtx pat; + + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + pat = PATTERN (a_insn); + if (GET_CODE (pat) == SET) + { + if (no_need_to_analyze_sfp (pat)) + continue; + + /* Collect all latent insns that generate pointers to the stack. */ + df_ref use; + FOR_EACH_INSN_USE (use, a_insn) + { + rtx x = DF_REF_REG (use); + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx + || bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_may_be_sfp, REGNO (SET_DEST (pat))); + sfp_found = true; + break; + } + } + } + else if (GET_CODE (pat) == PARALLEL) + { + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx sub = XVECEXP (pat, 0, i); + if (GET_CODE (sub) == SET) + { + if (no_need_to_analyze_sfp (sub)) + continue; + + /* Aggressively mark it as a pointer to the stack to + avoid any local stack address to escape out of current + frame. */ + bitmap_set_bit (bba_sets_may_be_sfp, + REGNO (SET_DEST (sub))); + sfp_found = true; + } + } + } + } + + return NULL; +} + +/* Find all insns that may be pointers to the stack and store REGNO into + bitmap BBA_SETS_MAY_BE_SFP. We iterate all insns in current func + until no more latent insns generating stack address are found. The + collection of pointers to the stack BBA_SETS_MAY_BE_SFP will be used + to help analyze local stack variable address taken. Stack variable + address can be passed out of current frame if only any pointer to the + stack is passed to hard register or stored into memory. */ + +static void +find_all_may_be_sfp_insns (void) +{ + do { + sfp_found = false; + may_be_sfp_dom_walker (CDI_DOMINATORS) + .walk (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + } while (sfp_found); +} + +/* Return true if current function doesn't pass stack address out of frame. 
*/ + +static bool +no_stack_address_taken (void) +{ + basic_block bb; + rtx_insn *a_insn; + const_rtx pat; + df_ref use; + rtx src, dest; + + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + pat = PATTERN (a_insn); + if (GET_CODE (pat) == SET) + { + src = SET_SRC (pat); + dest = SET_DEST (pat); + + /* Skip if it is already identified as pointers to the stack. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + /* Skip if it doesn't use any pointers to the stack at all. */ + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + continue; + + /* Store is safe if src doesn't contain any pointers to the + stack. */ + if (MEM_P (dest) + && !reg_mentioned_p (DF_REF_REG (use), src)) + continue; + + /* All other cases potentially introduce address taken. */ + return false; + } + } + else if (GET_CODE (pat) == PARALLEL) + { + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx sub = XVECEXP (pat, 0, i); + src = SET_SRC (sub); + dest = SET_DEST (sub); + + if (GET_CODE (sub) == SET) + { + /* Skip if it is already identified as pointers to + the stack. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + /* For all other cases, conservatively treat it as + assdress taken if only a pointer to the stack is + used. */ + FOR_EACH_INSN_USE (use, a_insn) + { + if (bitmap_bit_p (bba_sets_may_be_sfp, + DF_REF_REGNO (use))) + return false; + } + } + } + } + } + return true; +} + \f /* Main entry point for all if-conversion. AFTER_COMBINE is true if we are after combine pass. 
*/ @@ -5381,6 +5793,18 @@ if_convert (bool after_combine) df_set_flags (DF_LR_RUN_DCE); + bba_sets_must_be_sfp = BITMAP_ALLOC (&reg_obstack); + bba_sets_may_be_sfp = BITMAP_ALLOC (&reg_obstack); + + /* Prepare for stack variable analysis. */ + regstat_init_n_sets_and_refs (); + calculate_dominance_info (CDI_DOMINATORS); + find_all_must_be_sfp_insns (); + find_all_may_be_sfp_insns (); + cfun_no_stack_address_taken = no_stack_address_taken (); + free_dominance_info (CDI_DOMINATORS); + regstat_free_n_sets_and_refs (); + /* Go through each of the basic blocks looking for things to convert. If we have conditional execution, we make multiple passes to allow us to handle IF-THEN{-ELSE} blocks within other IF-THEN{-ELSE} blocks. */ @@ -5413,6 +5837,9 @@ if_convert (bool after_combine) } while (cond_exec_changed_p); + BITMAP_FREE (bba_sets_may_be_sfp); + BITMAP_FREE (bba_sets_must_be_sfp); + #ifdef IFCVT_MULTIPLE_DUMPS if (dump_file) fprintf (dump_file, "\n\n========== no more changes\n"); diff --git a/gcc/ifcvt.h b/gcc/ifcvt.h index 153ad96..46a32ad 100644 --- a/gcc/ifcvt.h +++ b/gcc/ifcvt.h @@ -73,6 +73,8 @@ struct noce_if_info /* The SET_SRC of INSN_A and INSN_B. */ rtx a, b; + /* The SET of INSN_A and INSN_B. */ + rtx set_a, set_b; /* The SET_DEST of INSN_A. */ rtx x; diff --git a/gcc/rtl.h b/gcc/rtl.h index 31fba82..5c5e018 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -3754,6 +3754,8 @@ extern struct target_rtl *this_target_rtl; #define hard_frame_pointer_rtx (global_rtl[GR_HARD_FRAME_POINTER]) #define arg_pointer_rtx (global_rtl[GR_ARG_POINTER]) +extern bool fixed_base_plus_p (rtx x); + #ifndef GENERATOR_FILE /* Return the attributes of a MEM rtx. */ static inline const struct mem_attrs * diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index 268a387..48c5c2f 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -341,6 +341,30 @@ rtx_varies_p (const_rtx x, bool for_alias) return 0; } +/* Nonzero if X has the form (PLUS frame-pointer integer). 
*/ + +bool +fixed_base_plus_p (rtx x) +{ + switch (GET_CODE (x)) + { + case REG: + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) + return true; + if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) + return true; + return false; + + case PLUS: + if (!CONST_INT_P (XEXP (x, 1))) + return false; + return fixed_base_plus_p (XEXP (x, 0)); + + default: + return false; + } +} + /* Compute an approximation for the offset between the register FROM and TO for the current function, as it was at the start of the routine. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 27a522e..c50cbce 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2019-06-18 Jiangning Liu <jiangning.liu@os.amperecomputing.com> + + * gcc.dg/ifcvt-6.c: New test. + 2019-06-17 Jakub Jelinek <jakub@redhat.com> * gcc.dg/vect/vect-simd-8.c: New test. diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c new file mode 100644 index 0000000..0c9dac42 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ifcvt-6.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-fdump-rtl-ce1 -O2" } */ + +unsigned test_ifcvt (unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_try_cmove_arith" "ce1" } } */ ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-20 9:53 ` JiangNing OS @ 2019-06-20 23:13 ` Jeff Law 2019-06-21 12:24 ` Richard Biener 0 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-20 23:13 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 6/20/19 3:53 AM, JiangNing OS wrote: > Hi Jeff, > > Appreciate your effort to review my patch! I've updated my patch as attached. See my answers below. > >> in current function, so the store speculation can be avoided. >> So at a high level should we be doing this in gimple rather than RTL? >> We're going to have a lot more information about types, better >> infrastructure for looking at uses/defs, access to the alias oracle, we should >> be able to accurately distinguish between potentially shared objects vs those >> which are local to the thread, etc. We lose the low level costing information >> though. >> >> I'm still going to go through the patch and do some level of review, but I do >> think we need to answer the higher level question though. >> > I have the following reasons, > > 1) Following the clue Richard B gave me before about parameter --param allow-store-data-races, > I did check the middle-end pass tree-if-conv, but I think this pass at the moment doesn't work > for the issue I'm trying to solve. Tree-if-conv is to do if conversion for loop, and its final goal is to > help loop vectorization, while my case doesn't have a loop at all. I think the fact that it's focused so much on loops is a historical accident. We certainly have a variety of places in the gimple optimizers that do if-conversion, and they're not all in tree-if-conv :( For example, some are done in tree-ssa-phiopt. 
In the gimple optimizers the testcase from 89430 is going to look something like this: > ; basic block 2, loop depth 0, count 1073741824 (estimated locally), maybe hot > ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED) > ;; pred: ENTRY [always] count:1073741824 (estimated locally) (FALLTHRU,EXECUTABLE) > a.0_1 = a; > _2 = (long unsigned int) k_8(D); > _3 = _2 * 4; > _4 = a.0_1 + _3; > _5 = *_4; > if (_5 > b_9(D)) > goto <bb 3>; [50.00%] > else > goto <bb 4>; [50.00%] > ;; succ: 3 [50.0% (guessed)] count:536870912 (estimated locally) (TRUE_VALUE,EXECUTABLE) > ;; 4 [50.0% (guessed)] count:536870912 (estimated locally) (FALSE_VALUE,EXECUTABLE) > > ;; basic block 3, loop depth 0, count 536870913 (estimated locally), maybe hot > ;; prev block 2, next block 4, flags: (NEW, REACHABLE, VISITED) > ;; pred: 2 [50.0% (guessed)] count:536870912 (estimated locally) (TRUE_VALUE,EXECUTABLE) > *_4 = b_9(D); > ;; succ: 4 [always] count:536870913 (estimated locally) (FALLTHRU,EXECUTABLE) > > ;; basic block 4, loop depth 0, count 1073741824 (estimated locally), maybe hot > ;; prev block 3, next block 1, flags: (NEW, REACHABLE, VISITED) > ;; pred: 3 [always] count:536870913 (estimated locally) (FALLTHRU,EXECUTABLE) > ;; 2 [50.0% (guessed)] count:536870912 (estimated locally) (FALSE_VALUE,EXECUTABLE) > return; That looks like a pretty easy form to analyze. I'd suggest looking through tree-ssa-phiopt.c closely. There's several transformations in there that share similarities with yours. > 2) My current solution fits into current back-end if-conversion pass very well. I don't want to invent > a new framework to solve this relatively small issue. Besides, this back-end patch doesn't only > enhance store speculation detection, but also fix a bug in the original code. Understood, but I still wonder if we're better off addressing this in gimple. 
>> Just from a design standpoint, what are the consequences if this function >> returns true for something that isn't actually in the stack or false for >> something that is on the stack? >> > If noce_mem_is_on_stack returns true for something that isn't actually in the stack, > it could potentially introduce store speculation, then the if-conversion optimization > will be incorrect. If this function returns false for something that is on stack, it doesn't > matter, because the optimization will not be triggered. OK. That's what I expected. > > > > >> >>> + >>> +/* Always return true, if there is a dominating write. >>> + >>> + When there is a dominating read from memory on stack, >>> + 1) if x = a is a memory read, return true. >>> + 2) if x = a is a memory write, return true if the memory is on stack. >>> + This is the guarantee the memory is *not* readonly. */ >>> + >>> +static bool >>> +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, >>> + const_rtx x, bool is_store) { >>> + rtx_insn *insn; >>> + rtx set; >>> + >>> + gcc_assert (MEM_P (x)); >>> + >>> + FOR_BB_INSNS (bb, insn) >>> + { >>> + set = single_set (insn); >>> + if (!set) >>> + continue; >>> + >>> + /* Dominating store */ >>> + if (rtx_equal_p (x, SET_DEST (set))) >>> + return true; >>> + >>> + /* Dominating load */ >>> + if (rtx_equal_p (x, SET_SRC (set))) >>> + if (is_store && noce_mem_is_on_stack (a_insn, x)) >>> + return true; >>> + } >>> + >>> + return false; >>> +} >> So what would be the consequences here of returning false when in fact >> there was a dominating read or write? That could easily happen if the >> dominating read or write was not a single_set. > If return false when in fact there is a dominating read or write, the optimization will not > be triggered, because noce_mem_maybe_invalid_p will be true, which is playing the same > role as may_trap_or_fault_p. 
> >> >> I'm guessing that from a design standpoint you're trying to find cases where >> you know an object was written to within the block and does not escape. So >> returning false when there was a dominating write is safe. >> Returning true when there was not would be disastrous. Right? >> > YES! You've completely understood my code! ð So in this context, only handling single_sets is safe. Perfect. > >> >> >> >>> + >>> + /* Iterate all insns until finding all stack pointers, and store >>> + them into bba_sets_must_be_sfp. */ bba_sets_must_be_sfp = >>> + BITMAP_ALLOC (®_obstack); do >>> + { >>> + sfp_found = false; >>> + FOR_ALL_BB_FN (bb, cfun) >>> + FOR_BB_INSNS (bb, a_insn) >>> + { >>> + rtx sset_a = single_set (a_insn); >>> + if (!sset_a) >>> + continue; >>> + rtx src = SET_SRC (sset_a); >>> + rtx dest = SET_DEST (sset_a); >> So again, we can get things that aren't a single_set here and they'll be >> ignored. But I believe that's safe as you're trying to identify insns which store >> stack addresses into a REG. Missing a more complicated case where a stack >> address is stored into a REG is safe. Right? >> > Yes. Missing some pointers to stack is safe here, because we will use it to check if a memory > in if-conversion candidate doesn't have store speculation. If we miss it, the optimization will > not be triggered, so there will not be risky. Thanks for the confirmation. >> >>> + >>> + /* Handle case like "x2 = x1 + offset", in which x1 is already >>> + identified as a stack pointer. 
*/ >>> + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) >>> + && CONST_INT_P (XEXP (src, 1))) >>> + { >>> + rtx x1 = XEXP (src, 0); >>> + if (!REG_P (x1)) >>> + continue; >>> + >>> + FOR_EACH_INSN_USE (use, a_insn) >>> + { >>> + if (!rtx_equal_p (x1, DF_REF_REG (use))) >>> + continue; >>> + >>> + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO >> (use))) >>> + { >>> + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); >>> + sfp_found = true; >>> + break; >>> + } >>> + } >> So how much is there to be gained from going from this kind of localized >> computation to global? It shouldn't be hard to turn this into a truly global >> algorithm, but I'm not sure it's worth the effort. >> >> I _do_ think it's worth visiting blocks in dominator order. It's better than >> what you're doing, but nowhere near as expensive as a global propagation >> engine. >> > Fixed in attached new patch using dom_walk class. Please review again. > >> Is it worth handling simple copies in the code above? > I thought it less likely to have a copy for pointer to stack before register allocation. Right? We certainly try to eliminate as many copies as possible, but experience would tell us that they often sneak through. I guess some quick instrumentation would tell us if it's worth the effort. > >> >>> +/* Return true if current function doesn't pass stack address out of >>> +frame. */ >>> + >>> +static bool >>> +no_stack_address_taken (void) >>> +{ >>> + basic_block bb; >>> + rtx_insn *a_insn; >>> + df_ref use; >>> + >>> + FOR_ALL_BB_FN (bb, cfun) >>> + FOR_BB_INSNS (bb, a_insn) >>> + { >>> + if (!INSN_P (a_insn)) >>> + continue; >>> + >>> + rtx sset_a = single_set (a_insn); >>> + if (!sset_a) >>> + continue; >>> + rtx src = SET_SRC (sset_a); >>> + rtx dest = SET_DEST (sset_a); >>> + >>> + /* Skip insn that is already identified as frame pointers. 
*/ >>> + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) >>> + continue; >>> + >>> + FOR_EACH_INSN_USE (use, a_insn) >>> + { >>> + /* Check if current insn is using any stack pointer. Ignore >>> + if it doesn't use frame pointers at all. */ >>> + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) >>> + continue; >>> + >>> + /* Load cannot introduce address taken. */ >>> + if (MEM_P (src)) >>> + continue; >>> + >>> + /* Store is safe if only the src doesn't contain stack pointer. */ >>> + if (MEM_P (dest) >>> + && !reg_mentioned_p (DF_REF_REG (use), src)) >>> + continue; >>> + >>> + /* All other cases potentially introduce address taken. */ >>> + return false; >>> + } >> So what if you have a PARALLEL where one entry causes an escape of a >> pointer into the stack and another entry does some other operation? If so, >> don't you miss the fact that you've got an escaping pointer into the stack? I >> don't think you can restrict yourself to single_sets here. > Yes. You are right. I have added code to handle PARALLEL case. I handle it in a rather conservative way though. Review again, please! I'll take another look at the patch. I'll also do a little experimentation to see if we're seeing enough copies to make handling them worth the effort. While I'm doing that, if you could walk through tree-ssa-phiopt.c and ponder if your transformation could fit into that framework it'd be appreciated (note the comment at the start of the file only covers one class of transformations, there are others later). jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-20 23:13 ` Jeff Law @ 2019-06-21 12:24 ` Richard Biener 2019-06-21 16:10 ` Jeff Law 0 siblings, 1 reply; 15+ messages in thread From: Richard Biener @ 2019-06-21 12:24 UTC (permalink / raw) To: Jeff Law; +Cc: JiangNing OS, gcc-patches On Fri, Jun 21, 2019 at 1:13 AM Jeff Law <law@redhat.com> wrote: > On 6/20/19 3:53 AM, JiangNing OS wrote: > > Hi Jeff, > > > > Appreciate your effort to review my patch! I've updated my patch as > attached. See my answers below. > > > >> in current function, so the store speculation can be avoided. > >> So at a high level should we be doing this in gimple rather than RTL? > >> We're going to have a lot more information about types, better > >> infrastructure for looking at uses/defs, access to the alias oracle, we > should > >> be able to accurately distinguish between potentially shared objects vs > those > >> which are local to the thread, etc. We lose the low level costing > information > >> though. > >> > >> I'm still going to go through the patch and do some level of review, > but I do > >> think we need to answer the higher level question though. > >> > > I have the following reasons, > > > > 1) Following the clue Richard B gave me before about parameter --param > allow-store-data-races, > > I did check the middle-end pass tree-if-conv, but I think this pass at > the moment doesn't work > > for the issue I'm trying to solve. Tree-if-conv is to do if conversion > for loop, and its final goal is to > > help loop vectorization, while my case doesn't have a loop at all. > I think the fact that it's focused so much on loops is a historical > accident. We certainly have a variety of places in the gimple > optimizers that do if-conversion, and they're not all in tree-if-conv :( > For example, some are done in tree-ssa-phiopt. 
> > In the gimple optimizers the testcase from 89430 is going to look > something like this: > > > > ; basic block 2, loop depth 0, count 1073741824 (estimated locally), > maybe hot > > ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED) > > ;; pred: ENTRY [always] count:1073741824 (estimated locally) > (FALLTHRU,EXECUTABLE) > > a.0_1 = a; > > _2 = (long unsigned int) k_8(D); > > _3 = _2 * 4; > > _4 = a.0_1 + _3; > > _5 = *_4; > > if (_5 > b_9(D)) > > goto <bb 3>; [50.00%] > > else > > goto <bb 4>; [50.00%] > > ;; succ: 3 [50.0% (guessed)] count:536870912 (estimated > locally) (TRUE_VALUE,EXECUTABLE) > > ;; 4 [50.0% (guessed)] count:536870912 (estimated > locally) (FALSE_VALUE,EXECUTABLE) > > > > ;; basic block 3, loop depth 0, count 536870913 (estimated locally), > maybe hot > > ;; prev block 2, next block 4, flags: (NEW, REACHABLE, VISITED) > > ;; pred: 2 [50.0% (guessed)] count:536870912 (estimated > locally) (TRUE_VALUE,EXECUTABLE) > > *_4 = b_9(D); > > ;; succ: 4 [always] count:536870913 (estimated locally) > (FALLTHRU,EXECUTABLE) > > > > ;; basic block 4, loop depth 0, count 1073741824 (estimated locally), > maybe hot > > ;; prev block 3, next block 1, flags: (NEW, REACHABLE, VISITED) > > ;; pred: 3 [always] count:536870913 (estimated locally) > (FALLTHRU,EXECUTABLE) > > ;; 2 [50.0% (guessed)] count:536870912 (estimated > locally) (FALSE_VALUE,EXECUTABLE) > > return; > > That looks like a pretty easy form to analyze. I'd suggest looking > through tree-ssa-phiopt.c closely. There's several transformations in > there that share similarities with yours. > More specifically there is the conditional store elimination (cselim) pass inside this file. > > > > > > > > > > 2) My current solution fits into current back-end if-conversion pass > very well. I don't want to invent > > a new framework to solve this relatively small issue. 
Besides, this > back-end patch doesn't only > > enhance store speculation detection, but also fix a bug in the original > code. > Understood, but I still wonder if we're better off addressing this in > gimple. > Traditionally if-conversions profitability heavily depends on the target and esp. if memory is involved costing on GIMPLE is hard. That's also one reason why we turned off loop if-conversion on GIMPLE for non-vectorized code. phiopt is really about followup optimization opportunities in passes that do not handle conditionals well. cselim is on the border... > >> Just from a design standpoint, what are the consequences if this > function > >> returns true for something that isn't actually in the stack or false for > >> something that is on the stack? > >> > > If noce_mem_is_on_stack returns true for something that isn't actually > in the stack, > > it could potentially introduce store speculation, then the if-conversion > optimization > > will be incorrect. If this function returns false for something that is > on stack, it doesn't > > matter, because the optimization will not be triggered. > OK. That's what I expected. > > > > > > > > > >> > >>> + > >>> +/* Always return true, if there is a dominating write. > >>> + > >>> + When there is a dominating read from memory on stack, > >>> + 1) if x = a is a memory read, return true. > >>> + 2) if x = a is a memory write, return true if the memory is on > stack. > >>> + This is the guarantee the memory is *not* readonly. 
*/ > >>> + > >>> +static bool > >>> +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > >>> + const_rtx x, bool is_store) { > >>> + rtx_insn *insn; > >>> + rtx set; > >>> + > >>> + gcc_assert (MEM_P (x)); > >>> + > >>> + FOR_BB_INSNS (bb, insn) > >>> + { > >>> + set = single_set (insn); > >>> + if (!set) > >>> + continue; > >>> + > >>> + /* Dominating store */ > >>> + if (rtx_equal_p (x, SET_DEST (set))) > >>> + return true; > >>> + > >>> + /* Dominating load */ > >>> + if (rtx_equal_p (x, SET_SRC (set))) > >>> + if (is_store && noce_mem_is_on_stack (a_insn, x)) > >>> + return true; > >>> + } > >>> + > >>> + return false; > >>> +} > >> So what would be the consequences here of returning false when in fact > >> there was a dominating read or write? That could easily happen if the > >> dominating read or write was not a single_set. > > If return false when in fact there is a dominating read or write, the > optimization will not > > be triggered, because noce_mem_maybe_invalid_p will be true, which is > playing the same > > role as may_trap_or_fault_p. > > > >> > >> I'm guessing that from a design standpoint you're trying to find cases > where > >> you know an object was written to within the block and does not > escape. So > >> returning false when there was a dominating write is safe. > >> Returning true when there was not would be disastrous. Right? > >> > > YES! You've completely understood my code! 😊 > So in this context, only handling single_sets is safe. Perfect. > > > > > >> > >> > >> > >>> + > >>> + /* Iterate all insns until finding all stack pointers, and store > >>> + them into bba_sets_must_be_sfp. 
*/ bba_sets_must_be_sfp = > >>> + BITMAP_ALLOC (®_obstack); do > >>> + { > >>> + sfp_found = false; > >>> + FOR_ALL_BB_FN (bb, cfun) > >>> + FOR_BB_INSNS (bb, a_insn) > >>> + { > >>> + rtx sset_a = single_set (a_insn); > >>> + if (!sset_a) > >>> + continue; > >>> + rtx src = SET_SRC (sset_a); > >>> + rtx dest = SET_DEST (sset_a); > >> So again, we can get things that aren't a single_set here and they'll be > >> ignored. But I believe that's safe as you're trying to identify insns > which store > >> stack addresses into a REG. Missing a more complicated case where a > stack > >> address is stored into a REG is safe. Right? > >> > > Yes. Missing some pointers to stack is safe here, because we will use it > to check if a memory > > in if-conversion candidate doesn't have store speculation. If we miss > it, the optimization will > > not be triggered, so there will not be risky. > Thanks for the confirmation. > > >> > >>> + > >>> + /* Handle case like "x2 = x1 + offset", in which x1 is > already > >>> + identified as a stack pointer. */ > >>> + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > >>> + && CONST_INT_P (XEXP (src, 1))) > >>> + { > >>> + rtx x1 = XEXP (src, 0); > >>> + if (!REG_P (x1)) > >>> + continue; > >>> + > >>> + FOR_EACH_INSN_USE (use, a_insn) > >>> + { > >>> + if (!rtx_equal_p (x1, DF_REF_REG (use))) > >>> + continue; > >>> + > >>> + if (bitmap_bit_p (bba_sets_must_be_sfp, > DF_REF_REGNO > >> (use))) > >>> + { > >>> + bitmap_set_bit (bba_sets_must_be_sfp, > dest_regno); > >>> + sfp_found = true; > >>> + break; > >>> + } > >>> + } > >> So how much is there to be gained from going from this kind of localized > >> computation to global? It shouldn't be hard to turn this into a truly > global > >> algorithm, but I'm not sure it's worth the effort. > >> > >> I _do_ think it's worth visiting blocks in dominator order. It's > better than > >> what you're doing, but nowhere near as expensive as a global propagation > >> engine. 
> >> > > Fixed in attached new patch using dom_walk class. Please review again. > > > >> Is it worth handling simple copies in the code above? > > I thought it less likely to have a copy for pointer to stack before > register allocation. Right? > We certainly try to eliminate as many copies as possible, but experience > would tell us that they often sneak through. I guess some quick > instrumentation would tell us if it's worth the effort. > > > > > > > >> > >>> +/* Return true if current function doesn't pass stack address out of > >>> +frame. */ > >>> + > >>> +static bool > >>> +no_stack_address_taken (void) > >>> +{ > >>> + basic_block bb; > >>> + rtx_insn *a_insn; > >>> + df_ref use; > >>> + > >>> + FOR_ALL_BB_FN (bb, cfun) > >>> + FOR_BB_INSNS (bb, a_insn) > >>> + { > >>> + if (!INSN_P (a_insn)) > >>> + continue; > >>> + > >>> + rtx sset_a = single_set (a_insn); > >>> + if (!sset_a) > >>> + continue; > >>> + rtx src = SET_SRC (sset_a); > >>> + rtx dest = SET_DEST (sset_a); > >>> + > >>> + /* Skip insn that is already identified as frame pointers. */ > >>> + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > >>> + continue; > >>> + > >>> + FOR_EACH_INSN_USE (use, a_insn) > >>> + { > >>> + /* Check if current insn is using any stack pointer. > Ignore > >>> + if it doesn't use frame pointers at all. */ > >>> + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO > (use))) > >>> + continue; > >>> + > >>> + /* Load cannot introduce address taken. */ > >>> + if (MEM_P (src)) > >>> + continue; > >>> + > >>> + /* Store is safe if only the src doesn't contain stack > pointer. */ > >>> + if (MEM_P (dest) > >>> + && !reg_mentioned_p (DF_REF_REG (use), src)) > >>> + continue; > >>> + > >>> + /* All other cases potentially introduce address taken. */ > >>> + return false; > >>> + } > >> So what if you have a PARALLEL where one entry causes an escape of a > >> pointer into the stack and another entry does some other operation? 
If > so, > >> don't you miss the fact that you've got an escaping pointer into the > stack? I > >> don't think you can restrict yourself to single_sets here. > > Yes. You are right. I have added code to handle PARALLEL case. I handle > it in a rather conservative way though. Review again, please! > I'll take another look at the patch. I'll also do a little > experimentation to see if we're seeing enough copies to make handling > them worth the effort. > > While I'm doing that, if you could walk through tree-ssa-phiopt.c and > ponder if your transformation could fit into that framework it'd be > appreciated (note the comment at the start of the file only covers one > class of transformations, there are others later). > > jeff > ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-21 12:24 ` Richard Biener @ 2019-06-21 16:10 ` Jeff Law 2019-07-08 7:43 ` JiangNing OS 0 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-21 16:10 UTC (permalink / raw) To: Richard Biener; +Cc: JiangNing OS, gcc-patches On 6/21/19 6:23 AM, Richard Biener wrote: > > That looks like a pretty easy form to analyze. I'd suggest looking > through tree-ssa-phiopt.c closely. There's several transformations > in there that share similarities with yours. > > > More specifically there is the conditional store elimination (cselim) > pass inside this file. That's the one I was thinking of. It looks reasonably close to the cases JiangNing is trying to capture. > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > pass very well. I don't want to invent >> a new framework to solve this relatively small issue. Besides, > this back-end patch doesn't only >> enhance store speculation detection, but also fix a bug in the > original code. Understood, but I still wonder if we're better off > addressing this in gimple. > > > Traditionally if-conversions profitability heavily depends on the > target and esp. if memory is involved costing on GIMPLE is hard. > That's also one reason why we turned off loop if-conversion on GIMPLE > for non-vectorized code. Yea. That's always the concern for transformations that aren't trivial to show are better. Conditional store elimination as implemented right now in phiopt requires a single store in the then/else clauses. So we're really just sinking the stores through a PHI. We're really not changing the number of loads or stores on any path. In the cases I think JiangNing is trying to handle we are adding a store on a path where it didn't previously exist because there is no else clause. So we likely need to check the edge probabilities to avoid doing something dumb. I don't have a good sense where the cutoff should be. 
tree-ssa-sink might have some nuggets of information to help guide. > > phiopt is really about followup optimization opportunities in passes > that do not handle conditionals well. > > cselim is on the border... ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-21 16:10 ` Jeff Law @ 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:40 ` Jeff Law 0 siblings, 2 replies; 15+ messages in thread From: JiangNing OS @ 2019-07-08 7:43 UTC (permalink / raw) To: Jeff Law, Richard Biener; +Cc: gcc-patches [-- Attachment #1: Type: text/plain, Size: 2682 bytes --] Hi Jeff and Richard B., Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! Thanks a lot! -Jiangning > -----Original Message----- > From: Jeff Law <law@redhat.com> > Sent: Saturday, June 22, 2019 12:10 AM > To: Richard Biener <richard.guenther@gmail.com> > Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- > patches@gcc.gnu.org > Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) > > On 6/21/19 6:23 AM, Richard Biener wrote: > > > > That looks like a pretty easy form to analyze. I'd suggest looking > > through tree-ssa-phiopt.c closely. There's several transformations in > > there that share similarities with yours. > > > > > > More specifically there is the conditional store elimination (cselim) > > pass inside this file. > That's the one I was thinking of. It looks reasonably close to the cases > JiangNing is trying to capture. > > > > > > > > > > > > > > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > > pass very well. I don't want to invent > >> a new framework to solve this relatively small issue. Besides, > > this back-end patch doesn't only > >> enhance store speculation detection, but also fix a bug in the > > original code. Understood, but I still wonder if we're better off > > addressing this in gimple. > > > > > > Traditionally if-conversions profitability heavily depends on the > > target and esp. if memory is involved costing on GIMPLE is hard. 
> > That's also one reason why we turned off loop if-conversion on GIMPLE > > for non-vectorized code. > Yea. That's always the concern for transformations that aren't trivial to show > are better. > > Conditional store elimination as implemented right now in phiopt requires a > single store in the then/else clauses. So we're really just sinking the stores > through a PHI. We're really not changing the number of loads or stores on > any path. > > In the cases I think JiangNing is trying to handle we are adding a store on a > path where it didn't previously exist because there is no else clause. So we > likely need to check the edge probabilities to avoid doing something dumb. I > don't have a good sense where the cutoff should be. tree-ssa-sink might > have some nuggets of information to help guide. > > > > > phiopt is really about followup optimization opportunities in passes > > that do not handle conditionals well. > > > > cselim is on the border... > ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) > > > jeff [-- Attachment #2: csel5.patch --] [-- Type: application/octet-stream, Size: 6269 bytes --] diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9f83f75b0bf..1a7acf7f1ba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> + + PR tree-optimization/89430 + * tree-ssa-phiopt.c (cond_store_replacement): Support conditional + store elimination for local variable without address escape. + 2019-07-07 Jeff Law <law@redhat.com> PR tree-optimization/91090 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 12e5bc167e0..65a34b43fb5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> + + PR tree-optimization/89430 + * gcc.dg/tree-ssa/pr89430-1.c: New test. + * gcc.dg/tree-ssa/pr89430-2.c: New test. + * gcc.dg/tree-ssa/pr89430-3.c: New test. 
+ * gcc.dg/tree-ssa/pr89430-4.c: New test. + * gcc.dg/tree-ssa/pr89430-5.c: New test. + * gcc.dg/tree-ssa/pr89430-6.c: New test. + 2019-07-07 Paul Thomas <pault@gcc.gnu.org> PR fortran/91077 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c new file mode 100644 index 00000000000..8ee1850ac63 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c new file mode 100644 index 00000000000..9b96875ac7a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int c; +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + a[k] = c; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c new file mode 100644 index 00000000000..0fac9f9b9c7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +unsigned a[2]; +unsigned test(unsigned k, unsigned b) { + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump-not "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c new file mode 100644 index 00000000000..54b8c11a407 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* 
{ dg-options "-O2 -fdump-tree-cselim-details" } */ + +int *p; +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + p = a; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump-not "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c new file mode 100644 index 00000000000..b2d04119381 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int test(int b, int k) { + struct { + int data[2]; + } a; + + if (b < a.data[k]) { + a.data[k] = b; + } + + return a.data[0] + a.data[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c new file mode 100644 index 00000000000..8d3c4f7cc6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int test(int b, int k) { + typedef struct { + int x; + } SS; + struct { + SS data[2]; + } a; + + if (b < a.data[k].x) { + a.data[k].x = b; + } + + return a.data[0].x + a.data[1].x; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c index 7088ff91998..2678f58be74 100644 --- a/gcc/tree-ssa-phiopt.c +++ b/gcc/tree-ssa-phiopt.c @@ -2218,8 +2218,9 @@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, locus = gimple_location (assign); lhs = gimple_assign_lhs (assign); rhs = gimple_assign_rhs1 (assign); - if (TREE_CODE (lhs) != MEM_REF - || TREE_CODE (TREE_OPERAND (lhs, 0)) != SSA_NAME + if ((TREE_CODE (lhs) != MEM_REF + && TREE_CODE (lhs) != ARRAY_REF + && TREE_CODE (lhs) != COMPONENT_REF) || !is_gimple_reg_type (TREE_TYPE (lhs))) return false; @@ -2227,7 +2228,13 
@@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, TREE_THIS_NOTRAP here, but in that case we also could move stores, whose value is not available readily, which we want to avoid. */ if (!nontrap->contains (lhs)) - return false; + { + /* If LHS is a local variable without address-taken, we could + always safely move down the store. */ + tree base = get_base_address (lhs); + if (!auto_var_p (base) || TREE_ADDRESSABLE (base)) + return false; + } /* Now we've checked the constraints, so do the transformation: 1) Remove the single store. */ @@ -2280,6 +2287,14 @@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, else gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nConditional store replacement happened!"); + fprintf (dump_file, "\nReplaced the store with a load."); + fprintf (dump_file, "\nInserted a new PHI statement in joint block:\n"); + print_gimple_stmt (dump_file, new_stmt, 0, TDF_VOPS|TDF_MEMSYMS); + } + return true; } ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-08 7:43 ` JiangNing OS @ 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:11 ` Jeff Law 2019-07-12 16:40 ` Jeff Law 1 sibling, 1 reply; 15+ messages in thread From: Andrew Pinski @ 2019-07-12 5:22 UTC (permalink / raw) To: JiangNing OS; +Cc: Jeff Law, Richard Biener, gcc-patches On Mon, Jul 8, 2019 at 12:39 AM JiangNing OS <jiangning@os.amperecomputing.com> wrote: > > Hi Jeff and Richard B., > > Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! /* Prove that we can move the store down. We could also check TREE_THIS_NOTRAP here, but in that case we also could move stores, whose value is not available readily, which we want to avoid. */ I think the comment above the change needs to be changed or extended slightly. Thanks, Andrew Pinski > > Thanks a lot! > -Jiangning > > > -----Original Message----- > > From: Jeff Law <law@redhat.com> > > Sent: Saturday, June 22, 2019 12:10 AM > > To: Richard Biener <richard.guenther@gmail.com> > > Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- > > patches@gcc.gnu.org > > Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) > > > > On 6/21/19 6:23 AM, Richard Biener wrote: > > > > > > That looks like a pretty easy form to analyze. I'd suggest looking > > > through tree-ssa-phiopt.c closely. There's several transformations in > > > there that share similarities with yours. > > > > > > > > > More specifically there is the conditional store elimination (cselim) > > > pass inside this file. > > That's the one I was thinking of. It looks reasonably close to the cases > > JiangNing is trying to capture. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > > > pass very well. I don't want to invent > > >> a new framework to solve this relatively small issue. 
Besides, > > > this back-end patch doesn't only > > >> enhance store speculation detection, but also fix a bug in the > > > original code. Understood, but I still wonder if we're better off > > > addressing this in gimple. > > > > > > > > > Traditionally if-conversions profitability heavily depends on the > > > target and esp. if memory is involved costing on GIMPLE is hard. > > > That's also one reason why we turned off loop if-conversion on GIMPLE > > > for non-vectorized code. > > Yea. That's always the concern for transformations that aren't trivial to show > > are better. > > > > Conditional store elimination as implemented right now in phiopt requires a > > single store in the then/else clauses. So we're really just sinking the stores > > through a PHI. We're really not changing the number of loads or stores on > > any path. > > > > In the cases I think JiangNing is trying to handle we are adding a store on a > > path where it didn't previously exist because there is no else clause. So we > > likely need to check the edge probabilities to avoid doing something dumb. I > > don't have a good sense where the cutoff should be. tree-ssa-sink might > > have some nuggets of information to help guide. > > > > > > > > phiopt is really about followup optimization opportunities in passes > > > that do not handle conditionals well. > > > > > > cselim is on the border... > > ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) > > > > > > jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-12 5:22 ` Andrew Pinski @ 2019-07-12 16:11 ` Jeff Law 0 siblings, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-07-12 16:11 UTC (permalink / raw) To: Andrew Pinski, JiangNing OS; +Cc: Richard Biener, gcc-patches On 7/11/19 10:08 PM, Andrew Pinski wrote: > On Mon, Jul 8, 2019 at 12:39 AM JiangNing OS > <jiangning@os.amperecomputing.com> wrote: >> >> Hi Jeff and Richard B., >> >> Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! > > /* Prove that we can move the store down. We could also check > TREE_THIS_NOTRAP here, but in that case we also could move stores, > whose value is not available readily, which we want to avoid. */ > > I think the comment above the change needs to be changed or extended slightly. Actually, I don't think that comment needs to change. But the one before cond_store_replacement needs minor updating which I'll take care of. My reasoning is that addressables in the local stack are most likely going to already have their lines in the cache and thus are available readily. Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski @ 2019-07-12 16:40 ` Jeff Law 1 sibling, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-07-12 16:40 UTC (permalink / raw) To: JiangNing OS, Richard Biener; +Cc: gcc-patches On 7/8/19 1:39 AM, JiangNing OS wrote: > Hi Jeff and Richard B., > > Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! > > Thanks a lot! > -Jiangning > >> -----Original Message----- >> From: Jeff Law <law@redhat.com> >> Sent: Saturday, June 22, 2019 12:10 AM >> To: Richard Biener <richard.guenther@gmail.com> >> Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- >> patches@gcc.gnu.org >> Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) >> >> On 6/21/19 6:23 AM, Richard Biener wrote: >>> That looks like a pretty easy form to analyze. I'd suggest looking >>> through tree-ssa-phiopt.c closely. There's several transformations in >>> there that share similarities with yours. >>> >>> >>> More specifically there is the conditional store elimination (cselim) >>> pass inside this file. >> That's the one I was thinking of. It looks reasonably close to the cases >> JiangNing is trying to capture. >> >> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>>> 2) My current solution fits into current back-end if-conversion >>> pass very well. I don't want to invent >>>> a new framework to solve this relatively small issue. Besides, >>> this back-end patch doesn't only >>>> enhance store speculation detection, but also fix a bug in the >>> original code. Understood, but I still wonder if we're better off >>> addressing this in gimple. >>> >>> >>> Traditionally if-conversions profitability heavily depends on the >>> target and esp. if memory is involved costing on GIMPLE is hard. 
>>> That's also one reason why we turned off loop if-conversion on GIMPLE >>> for non-vectorized code. >> Yea. That's always the concern for transformations that aren't trivial to show >> are better. >> >> Conditional store elimination as implemented right now in phiopt requires a >> single store in the then/else clauses. So we're really just sinking the stores >> through a PHI. We're really not changing the number of loads or stores on >> any path. >> >> In the cases I think JiangNing is trying to handle we are adding a store on a >> path where it didn't previously exist because there is no else clause. So we >> likely need to check the edge probabilities to avoid doing something dumb. I >> don't have a good sense where the cutoff should be. tree-ssa-sink might >> have some nuggets of information to help guide. >> >>> phiopt is really about followup optimization opportunities in passes >>> that do not handle conditionals well. >>> >>> cselim is on the border... >> ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) >> >> >> jeff > > csel5.patch > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index 9f83f75b0bf..1a7acf7f1ba 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,9 @@ > +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> > + > + PR tree-optimization/89430 > + * tree-ssa-phiopt.c (cond_store_replacement): Support conditional > + store elimination for local variable without address escape. > + > 2019-07-07 Jeff Law <law@redhat.com> > > PR tree-optimization/91090 > diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog > index 12e5bc167e0..65a34b43fb5 100644 > --- a/gcc/testsuite/ChangeLog > +++ b/gcc/testsuite/ChangeLog > @@ -1,3 +1,13 @@ > +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> > + > + PR tree-optimization/89430 > + * gcc.dg/tree-ssa/pr89430-1.c: New test. > + * gcc.dg/tree-ssa/pr89430-2.c: New test. > + * gcc.dg/tree-ssa/pr89430-3.c: New test. 
> + * gcc.dg/tree-ssa/pr89430-4.c: New test. > + * gcc.dg/tree-ssa/pr89430-5.c: New test. > + * gcc.dg/tree-ssa/pr89430-6.c: New test. THanks! I made a trivial update to the comment before cond_store_replacement and installed this patch on the trunk. jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2019-07-12 16:29 UTC | newest] Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law 2019-05-05 0:11 ` JiangNing OS 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 0:36 ` JiangNing OS 2019-06-17 21:59 ` Jeff Law 2019-06-20 9:53 ` JiangNing OS 2019-06-20 23:13 ` Jeff Law 2019-06-21 12:24 ` Richard Biener 2019-06-21 16:10 ` Jeff Law 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:11 ` Jeff Law 2019-07-12 16:40 ` Jeff Law
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).