using scratchpads to enhance RTL-level if-conversion: revised patch

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* using scratchpads to enhance RTL-level if-conversion: revised patch
@ 2015-10-13 20:05 Abe
  0 siblings, 0 replies; 19+ messages in thread
From: Abe @ 2015-10-13 20:05 UTC (permalink / raw)
  To: gcc-patches, Sebastian Pop, Kyrill Tkachov, Bernd Schmidt

[-- Attachment #1: Type: text/plain, Size: 215 bytes --]

Attached please find my revised patch to enhance RTL-level if-conversion
using scratchpads, newly optimizing only half-hammock stores for now.

Bootstrapped and regression-tested on x86_64 GNU/Linux.

Regards,

Abe

[-- Attachment #2: Abe_RTL_ifcvt_patch_1_v007.2___2015_Oct._13_.patch --]
[-- Type: text/plain, Size: 6705 bytes --]

--- ifcvt.c	2015-09-01 12:54:38.234108158 -0500
+++ ifcvt.c	2015-10-13 14:22:27.935204461 -0500
@@ -56,6 +56,8 @@
 #include "rtl-iter.h"
 #include "ifcvt.h"
 
+#include <utility>
+
 #ifndef MAX_CONDITIONAL_EXECUTE
 #define MAX_CONDITIONAL_EXECUTE \
   (BRANCH_COST (optimize_function_for_speed_p (cfun), false) \
@@ -66,6 +68,9 @@
 
 #define NULL_BLOCK	((basic_block) NULL)
 
+/* An arbitrary inclusive maximum size (in bytes) for each scratchpad.  */
+#define SCRATCHPAD_MAX_SIZE 128
+
 /* True if after combine pass.  */
 static bool ifcvt_after_combine;
 
@@ -110,6 +115,8 @@ static int dead_or_predicable (basic_blo
 			       edge, int);
 static void noce_emit_move_insn (rtx, rtx);
 static rtx_insn *block_has_only_trap (basic_block);
+
+static auto_vec<std::pair<rtx, unsigned int> > scratchpads;
 \f
 /* Count the number of non-jump active insns in BB.  */
 
@@ -2784,19 +2791,16 @@ noce_operand_ok (const_rtx op)
   return ! may_trap_p (op);
 }
 
-/* Return true if a write into MEM may trap or fault.  */
+/* Return true if a write into MEM may trap or fault
+   even in the presence of scratchpad support.  */
 
 static bool
-noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+noce_mem_write_may_trap_or_fault_p_1 (const_rtx mem)
 {
-  rtx addr;
-
   if (MEM_READONLY_P (mem))
     return true;
 
-  if (may_trap_or_fault_p (mem))
-    return true;
-
+  rtx addr;
   addr = XEXP (mem, 0);
 
   /* Call target hook to avoid the effects of -fpic etc....  */
@@ -2837,6 +2841,18 @@ noce_mem_write_may_trap_or_fault_p (cons
   return false;
 }
 
+/* Return true if a write into MEM may trap or fault
+   without scratchpad support.  */
+
+static bool
+noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+{
+  if (may_trap_or_fault_p (mem))
+    return true;
+
+  return noce_mem_write_may_trap_or_fault_p_1 (mem);
+}
+
 /* Return whether we can use store speculation for MEM.  TOP_BB is the
    basic block above the conditional block where we are considering
    doing the speculative store.  We look for whether MEM is set
@@ -3156,17 +3172,116 @@ noce_process_if_block (struct noce_if_in
 
   if (!set_b && MEM_P (orig_x))
     {
-      /* Disallow the "if (...) x = a;" form (implicit "else x = x;")
-	 for optimizations if writing to x may trap or fault,
-	 i.e. it's a memory other than a static var or a stack slot,
-	 is misaligned on strict aligned machines or is read-only.  If
-	 x is a read-only memory, then the program is valid only if we
-	 avoid the store into it.  If there are stores on both the
-	 THEN and ELSE arms, then we can go ahead with the conversion;
-	 either the program is broken, or the condition is always
-	 false such that the other memory is selected.  */
+      /* Disallow the "if (...) x = a;" form (with no "else") for optimizations
+	 when x is misaligned on strict-alignment machines or is read-only.
+	 If x is a memory other than a static var or a stack slot: for targets
+	 _with_ conditional move and _without_ conditional execution,
+	 convert using the scratchpad technique, otherwise don`t convert.
+	 If x is a read-only memory, then the program is valid only if we avoid
+	 the store into it.  If there are stores on both the THEN and ELSE arms,
+	 then we can go ahead with the conversion; either the program is broken,
+	 or the condition is always false such that the other memory is selected.
+	 The non-scratchpad-based conversion here has an implicit "else x = x;".  */
       if (noce_mem_write_may_trap_or_fault_p (orig_x))
-	return FALSE;
+	{
+	  if (   reload_completed
+	      || optimize < 2
+	      || optimize_function_for_size_p (cfun)
+	      || targetm.have_conditional_execution ()
+	      || !HAVE_conditional_move
+	      || (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1))))
+	    return FALSE;
+
+
+	  if (noce_mem_write_may_trap_or_fault_p_1 (orig_x)
+	      || !MEM_SIZE_KNOWN_P (orig_x))
+	    return FALSE;
+
+	  const size_t MEM_size = MEM_SIZE (orig_x);
+	  if (MEM_size > SCRATCHPAD_MAX_SIZE)
+	    return FALSE;
+
+	  rtx biggest_spad = 0;
+	  unsigned int biggest_spad_size = 0;
+	  if (size_t vec_len = scratchpads.length ())
+	    {
+	      std::pair<rtx, unsigned> tmp_pair = scratchpads[vec_len - 1];
+	      biggest_spad = tmp_pair.first;
+	      biggest_spad_size = tmp_pair.second;
+	    }
+	  if (MEM_size > biggest_spad_size)
+	    {
+	      biggest_spad_size = MEM_size;
+	      biggest_spad = assign_stack_local (GET_MODE (orig_x), MEM_size, 0);
+	      gcc_assert (biggest_spad);
+	      scratchpads.safe_push (std::make_pair (biggest_spad, MEM_size));
+	    }
+
+	  gcc_assert (biggest_spad);
+
+	  rtx reg_for_store_addr = gen_reg_rtx (Pmode);
+	  set_used_flags (reg_for_store_addr);
+
+	  start_sequence ();
+
+	  for (rtx_insn *insn = BB_HEAD (then_bb);
+	       insn && insn != insn_a && insn != BB_END (then_bb);
+	       insn = NEXT_INSN (insn))
+	    if (!(NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+	      duplicate_insn_chain (insn, insn);
+
+	  rtx target = noce_emit_cmove (if_info,
+					reg_for_store_addr,
+					GET_CODE (cond),
+					XEXP (cond, 0),
+					XEXP (cond, 1),
+					XEXP (orig_x, 0),
+					XEXP (biggest_spad, 0));
+
+	  if (!target)
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+	  if (target != reg_for_store_addr)
+	    noce_emit_move_insn (reg_for_store_addr, target);
+
+	  rtx mem = gen_rtx_MEM (GET_MODE (orig_x), reg_for_store_addr);
+	  MEM_NOTRAP_P (mem) = true;
+	  MEM_VOLATILE_P (mem) = MEM_VOLATILE_P (orig_x);
+
+	  alias_set_type temp_alias_set = new_alias_set ();
+	  if (MEM_ALIAS_SET (orig_x))
+	    record_alias_subset (MEM_ALIAS_SET (orig_x), temp_alias_set);
+	  set_mem_alias_set (mem, temp_alias_set);
+
+	  set_mem_align (mem, MIN (MEM_ALIGN (biggest_spad),
+				   MEM_ALIGN (orig_x)));
+	  if (MEM_ADDR_SPACE (orig_x) != MEM_ADDR_SPACE (biggest_spad))
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+
+	  set_used_flags (mem);
+
+	  noce_emit_move_insn (mem, a);
+
+	  rtx_insn *seq = end_ifcvt_sequence (if_info);
+	  if (!seq)
+	    return FALSE;
+
+	  unshare_all_rtl_in_chain (seq);
+
+	  /* Prevent the code right after "success:"
+	     from throwing away the changes.  */
+	  x = orig_x;
+
+	  emit_insn_before_setloc (seq, if_info->jump,
+				   INSN_LOCATION (if_info->insn_a));
+	  goto success;
+
+	}
 
       /* Avoid store speculation: given "if (...) x = a" where x is a
 	 MEM, we only want to do the store if x is always set
@@ -4959,6 +5074,9 @@ if_convert (bool after_combine)
   basic_block bb;
   int pass;
 
+  /* Ensure that we start the scratchpads data fresh each time.  */
+  scratchpads.truncate (0);
+
   if (optimize == 1)
     {
       df_live_add_problem ();

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-27 23:02 ` Abe
@ 2015-10-30 14:09   ` Bernd Schmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Bernd Schmidt @ 2015-10-30 14:09 UTC (permalink / raw)
  To: Abe, gcc-patches, Sebastian Pop, Kyrill Tkachov, Jakub Jelinek

(Jakub Cc'd because of code he added for PR23567).

On 10/27/2015 11:35 PM, Abe wrote:
> Thanks for all your feedback.  I have integrated as much of it as I
> could in the available time.

Unfortunately not all of it - I still think we need to have a better 
strategy of selecting a scratchpad than a newly allocated stack slot. 
There are sufficiently many options.

>          * ifcvt.c (noce_mem_write_may_go_wrong_even_with_scratchpads):
> New.

ChangeLog doesn't correspond to the patch. If the function actually 
existed I'd reject it for a way overlong identifier.

>          * target.h: New enum named "RTL_ifcvt_when_to_use_scratchpads".

Please follow ChangeLog writing guidelines.

> -/* Return true if a write into MEM may trap or fault.  */
> +/* Return true if a write into MEM may trap or fault
> +   even in the presence of scratchpad support.  */

I still think this comment is fairly useless and needs to better 
describe what it is actually doing.

>   static bool
> -noce_mem_write_may_trap_or_fault_p (const_rtx mem)
> +noce_mem_write_may_trap_or_fault_p_1 (const_rtx mem)

For what this ends up doing, I think a name like "unsafe_address_p" 
would be better. Also, I think the code in there is really dubious - it 
tries to look for SYMBOL_REFs which are in decl_readonly_section, but 
shouldn't that be unnecessary given the test for MEM_READONLY_P? It's 
completely unreliable anyway since the address could be loaded into a 
register (on most targets it would be) and we'd never see the SYMBOL_REF 
in that function.

> +/* Return true if a write into MEM may trap or fault
> +   without scratchpad support.  */

Just keep the previous comment without mentioning scratchpads.

> +static bool
> +noce_mem_write_may_trap_or_fault_p (const_rtx mem)
> +{
> +  if (may_trap_or_fault_p (mem))
> +    return true;
> +
> +  return noce_mem_write_may_trap_or_fault_p_1 (mem);
> +}
> +      if (RTL_ifcvt_use_spads_as_per_profile
> +        == targetm.RTL_ifcvt_when_to_use_scratchpads
> +          && (PROFILE_ABSENT == profile_status_for_fn (cfun)
> +          || PROFILE_GUESSED == profile_status_for_fn (cfun)
> +          || predictable_edge_p (then_edge)
> +          || ! maybe_hot_bb_p (cfun, then_bb)))
> +        return FALSE;

I guess this is slightly better than no cost estimate at all.

> +      if (noce_mem_write_may_trap_or_fault_p_1 (orig_x)

So why do you want to call this function anyway? Doesn't the scratchpad 
technique protect against storing to a bad address?

> +      const size_t MEM_size = MEM_SIZE (orig_x);

No uppercase letters for variables and such. Just use "sz" or "size" for 
brevity.

> +          biggest_spad = assign_stack_local (GET_MODE (orig_x),

Still the same problems - this is the least attractive choice of a 
scratchpad location, and the code may end up allocating more scratchpads 
than you need.

> +      emit_insn_before_setloc (seq, if_info->jump,
> +                   INSN_LOCATION (if_info->insn_a));

Formatting? Could be mail client damage.

> +DEFHOOKPOD
> +(RTL_ifcvt_when_to_use_scratchpads,
> +"*",
> +enum RTL_ifcvt_when_to_use_scratchpads,
> RTL_ifcvt_use_spads_as_per_profile)
> +

No caps, and maybe a less clumsy name.

> +enum RTL_ifcvt_when_to_use_scratchpads {
> +  RTL_ifcvt_never_use_scratchpads = 0,
> +  RTL_ifcvt_always_use_scratchpads,
> +  RTL_ifcvt_use_spads_as_per_profile
> +};

Likewise. Maybe

enum ifcvt_scratchpads_strategy {
   scratchpad_never,
   scratchpad_always,
   scratchpad_unpredictable
};

So, still a NACK from my side.

Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
       [not found] <024301d11106$2379b5f0$6a6d21d0$@samsung.com>
@ 2015-10-27 23:02 ` Abe
  2015-10-30 14:09   ` Bernd Schmidt
  0 siblings, 1 reply; 19+ messages in thread
From: Abe @ 2015-10-27 23:02 UTC (permalink / raw)
  To: gcc-patches, Sebastian Pop, Kyrill Tkachov, bernd Schmidt

[-- Attachment #1: Type: text/plain, Size: 10662 bytes --]

Dear all,

Thanks for all your feedback.  I have integrated as much of it as I could in the available time.

The revised patch now allows the target to decide when to use scratchpads,
the global default being to do so when the profile data indicates it is probably good to do so.

Bootstrapped and make-checked on x86_64 GNU/Linux.

Regards,

Abe








2015-10-06  Abe  <abe_skolnik@yahoo.com>

         * ifcvt.c (noce_mem_write_may_go_wrong_even_with_scratchpads): New.
         (noce_mem_write_may_trap_or_fault_p_1): New.
         (noce_mem_write_may_trap_or_fault_p): Moved most of it into
         "noce_mem_write_may_trap_or_fault_p_1".
         (noce_process_if_block): Added scratchpad-based support for if-converting
         half hammocks with writes to destinations GCC says may "trap or fault".
         (if_convert): Ensure the scratchpad-support vector is cleared upon
         each entry into the RTL if-converter.
         * target.def (RTL_ifcvt_when_to_use_scratchpads): New DEFHOOKPOD.
         * target.h: New enum named "RTL_ifcvt_when_to_use_scratchpads".






 From c24c9b7528a2865efe49e5abb812bfbbac1a5e1c Mon Sep 17 00:00:00 2001
From: Abe <abe_skolnik@yahoo.com>
Date: Tue, 27 Oct 2015 17:12:44 -0500
Subject: [PATCH] Adding my RTL ifcvt work on top of Ville`s upstream commit.

---
  gcc/ifcvt.c    | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
  gcc/target.def |   5 ++
  gcc/target.h   |   6 ++
  3 files changed, 160 insertions(+), 20 deletions(-)

diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 7ab738e..fb80ee9 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -56,6 +56,8 @@
  #include "rtl-iter.h"
  #include "ifcvt.h"

+#include <utility>
+
  #ifndef MAX_CONDITIONAL_EXECUTE
  #define MAX_CONDITIONAL_EXECUTE \
    (BRANCH_COST (optimize_function_for_speed_p (cfun), false) \
@@ -66,6 +68,9 @@

  #define NULL_BLOCK	((basic_block) NULL)

+/* An arbitrary inclusive maximum size (in bytes) for each scratchpad.  */
+#define SCRATCHPAD_MAX_SIZE 128
+
  /* True if after combine pass.  */
  static bool ifcvt_after_combine;

@@ -110,6 +115,8 @@ static int dead_or_predicable (basic_block, basic_block, basic_block,
  			       edge, int);
  static void noce_emit_move_insn (rtx, rtx);
  static rtx_insn *block_has_only_trap (basic_block);
+
+static auto_vec<std::pair<rtx, unsigned int> > scratchpads;
  \f
  /* Count the number of non-jump active insns in BB.  */

@@ -2828,19 +2835,16 @@ noce_operand_ok (const_rtx op)
    return ! may_trap_p (op);
  }

-/* Return true if a write into MEM may trap or fault.  */
+/* Return true if a write into MEM may trap or fault
+   even in the presence of scratchpad support.  */

  static bool
-noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+noce_mem_write_may_trap_or_fault_p_1 (const_rtx mem)
  {
-  rtx addr;
-
    if (MEM_READONLY_P (mem))
      return true;

-  if (may_trap_or_fault_p (mem))
-    return true;
-
+  rtx addr;
    addr = XEXP (mem, 0);

    /* Call target hook to avoid the effects of -fpic etc....  */
@@ -2881,6 +2885,18 @@ noce_mem_write_may_trap_or_fault_p (const_rtx mem)
    return false;
  }

+/* Return true if a write into MEM may trap or fault
+   without scratchpad support.  */
+
+static bool
+noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+{
+  if (may_trap_or_fault_p (mem))
+    return true;
+
+  return noce_mem_write_may_trap_or_fault_p_1 (mem);
+}
+
  /* Return whether we can use store speculation for MEM.  TOP_BB is the
     basic block above the conditional block where we are considering
     doing the speculative store.  We look for whether MEM is set
@@ -3031,7 +3047,7 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
     at converting the block.  */

  static int
-noce_process_if_block (struct noce_if_info *if_info)
+noce_process_if_block (struct noce_if_info *if_info, edge then_edge)
  {
    basic_block test_bb = if_info->test_bb;	/* test block */
    basic_block then_bb = if_info->then_bb;	/* THEN */
@@ -3047,7 +3063,7 @@ noce_process_if_block (struct noce_if_info *if_info)

       (1) if (...) x = a; else x = b;
       (2) x = b; if (...) x = a;
-     (3) if (...) x = a;   // as if with an initial x = x.
+     (3) if (...) x = a;

       The later patterns require jumps to be more expensive.
       For the if (...) x = a; else x = b; case we allow multiple insns
@@ -3200,17 +3216,127 @@ noce_process_if_block (struct noce_if_info *if_info)

    if (!set_b && MEM_P (orig_x))
      {
-      /* Disallow the "if (...) x = a;" form (implicit "else x = x;")
-	 for optimizations if writing to x may trap or fault,
-	 i.e. it's a memory other than a static var or a stack slot,
-	 is misaligned on strict aligned machines or is read-only.  If
-	 x is a read-only memory, then the program is valid only if we
-	 avoid the store into it.  If there are stores on both the
-	 THEN and ELSE arms, then we can go ahead with the conversion;
-	 either the program is broken, or the condition is always
-	 false such that the other memory is selected.  */
+      /* Disallow the "if (...) x = a;" form (with no "else") for optimizations
+	 when x is misaligned on strict-alignment machines or is read-only.
+	 If x is a memory other than a static var or a stack slot: for targets
+	 _with_ conditional move and _without_ conditional execution,
+	 convert using the scratchpad technique, otherwise don`t convert.
+	 If x is a read-only memory, then the program is valid only if we avoid
+	 the store into it.  If there are stores on both the THEN and ELSE arms,
+	 then we can go ahead with the conversion; either the program is broken,
+	 or the condition is always false such that the other memory is selected.
+	 The non-scratchpad-based conversion here has an implicit "else x = x;".  */
        if (noce_mem_write_may_trap_or_fault_p (orig_x))
-	return FALSE;
+	{
+	  if (reload_completed
+	      || optimize < 2
+	      || optimize_function_for_size_p (cfun)
+	      || targetm.have_conditional_execution ()
+	      || !HAVE_conditional_move
+	      || (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1)))
+	      || RTL_ifcvt_never_use_scratchpads
+	           == targetm.RTL_ifcvt_when_to_use_scratchpads)
+	    return FALSE;
+
+
+	  if (RTL_ifcvt_use_spads_as_per_profile
+		== targetm.RTL_ifcvt_when_to_use_scratchpads
+	      && (PROFILE_ABSENT == profile_status_for_fn (cfun)
+		  || PROFILE_GUESSED == profile_status_for_fn (cfun)
+		  || predictable_edge_p (then_edge)
+		  || ! maybe_hot_bb_p (cfun, then_bb)))
+	    return FALSE;
+
+
+	  if (noce_mem_write_may_trap_or_fault_p_1 (orig_x)
+	      || !MEM_SIZE_KNOWN_P (orig_x))
+	    return FALSE;
+
+	  const size_t MEM_size = MEM_SIZE (orig_x);
+	  if (MEM_size > SCRATCHPAD_MAX_SIZE)
+	    return FALSE;
+
+	  rtx biggest_spad = 0;
+	  unsigned int biggest_spad_size = 0;
+	  if (size_t vec_len = scratchpads.length ())
+	    {
+	      std::pair<rtx, unsigned> tmp_pair = scratchpads[vec_len - 1];
+	      biggest_spad = tmp_pair.first;
+	      biggest_spad_size = tmp_pair.second;
+	    }
+	  if (MEM_size > biggest_spad_size)
+	    {
+	      biggest_spad_size = MEM_size;
+	      biggest_spad = assign_stack_local (GET_MODE (orig_x), MEM_size, 0);
+	      gcc_assert (biggest_spad);
+	      scratchpads.safe_push (std::make_pair (biggest_spad, MEM_size));
+	    }
+
+	  gcc_assert (biggest_spad);
+
+	  rtx reg_for_store_addr = gen_reg_rtx (Pmode);
+	  set_used_flags (reg_for_store_addr);
+
+	  start_sequence ();
+
+	  for (rtx_insn *insn = BB_HEAD (then_bb);
+	       insn && insn != insn_a && insn != BB_END (then_bb);
+	       insn = NEXT_INSN (insn))
+	    if (!(NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+	      duplicate_insn_chain (insn, insn);
+
+	  rtx target = noce_emit_cmove (if_info,
+					reg_for_store_addr,
+					GET_CODE (cond),
+					XEXP (cond, 0),
+					XEXP (cond, 1),
+					XEXP (orig_x, 0),
+					XEXP (biggest_spad, 0));
+
+	  if (!target)
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+	  if (target != reg_for_store_addr)
+	    noce_emit_move_insn (reg_for_store_addr, target);
+
+	  rtx mem = gen_rtx_MEM (GET_MODE (orig_x), reg_for_store_addr);
+	  MEM_NOTRAP_P (mem) = MEM_NOTRAP_P (orig_x);
+	  MEM_VOLATILE_P (mem) = MEM_VOLATILE_P (orig_x);
+
+	  alias_set_type temp_alias_set = new_alias_set ();
+	  if (MEM_ALIAS_SET (orig_x))
+	    record_alias_subset (MEM_ALIAS_SET (orig_x), temp_alias_set);
+	  set_mem_alias_set (mem, temp_alias_set);
+
+	  set_mem_align (mem, MIN (MEM_ALIGN (biggest_spad),
+				   MEM_ALIGN (orig_x)));
+	  if (MEM_ADDR_SPACE (orig_x) != MEM_ADDR_SPACE (biggest_spad))
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+
+	  set_used_flags (mem);
+
+	  noce_emit_move_insn (mem, a);
+
+	  rtx_insn *seq = end_ifcvt_sequence (if_info);
+	  if (!seq)
+	    return FALSE;
+
+	  unshare_all_rtl_in_chain (seq);
+
+	  /* Prevent the code right after "success:"
+	     from throwing away the changes.  */
+	  x = orig_x;
+
+	  emit_insn_before_setloc (seq, if_info->jump,
+				   INSN_LOCATION (if_info->insn_a));
+	  goto success;
+
+	}

        /* Avoid store speculation: given "if (...) x = a" where x is a
  	 MEM, we only want to do the store if x is always set
@@ -3696,7 +3822,7 @@ noce_find_if_block (basic_block test_bb, edge then_edge, edge else_edge,

    /* Do the real work.  */

-  if (noce_process_if_block (&if_info))
+  if (noce_process_if_block (&if_info, then_edge))
      return TRUE;

    if (HAVE_conditional_move
@@ -5003,6 +5129,9 @@ if_convert (bool after_combine)
    basic_block bb;
    int pass;

+  /* Ensure that we start the scratchpads data fresh each time.  */
+  scratchpads.truncate (0);
+
    if (optimize == 1)
      {
        df_live_add_problem ();
diff --git a/gcc/target.def b/gcc/target.def
index d29aad5..dc007e5 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -5915,6 +5915,11 @@ HOOK_VECTOR_END (mode_switching)
  #include "target-insns.def"
  #undef DEF_TARGET_INSN

+DEFHOOKPOD
+(RTL_ifcvt_when_to_use_scratchpads,
+"*",
+enum RTL_ifcvt_when_to_use_scratchpads, RTL_ifcvt_use_spads_as_per_profile)
+
  /* Close the 'struct gcc_target' definition.  */
  HOOK_VECTOR_END (C90_EMPTY_HACK)

diff --git a/gcc/target.h b/gcc/target.h
index a79f424..bbe4814 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -187,6 +187,12 @@ enum vect_cost_model_location {
  #define DEFHOOK_UNDOC DEFHOOK
  #define HOOKSTRUCT(FRAGMENT) FRAGMENT

+enum RTL_ifcvt_when_to_use_scratchpads {
+  RTL_ifcvt_never_use_scratchpads = 0,
+  RTL_ifcvt_always_use_scratchpads,
+  RTL_ifcvt_use_spads_as_per_profile
+};
+
  #include "target.def"

  extern struct gcc_target targetm;
-- 
2.1.0.243.g30d45f7


[-- Attachment #2: 0001-Adding-my-RTL-ifcvt-work-on-top-of-Ville-s-upstream-.patch.gz --]
[-- Type: application/x-gzip, Size: 3631 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-20  5:52           ` Jeff Law
@ 2015-10-20  9:37             ` Richard Biener
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Biener @ 2015-10-20  9:37 UTC (permalink / raw)
  To: Jeff Law; +Cc: Bernd Schmidt, Abe, Sebastian Pop, Kyrill Tkachov, gcc-patches

On Tue, Oct 20, 2015 at 7:43 AM, Jeff Law <law@redhat.com> wrote:
> On 10/14/2015 01:15 PM, Bernd Schmidt wrote:
>>
>> On 10/14/2015 07:43 PM, Jeff Law wrote:
>>>
>>> Obviously some pessimization relative to current code is necessary to
>>> fix some of the problems WRT thread safety and avoiding things like
>>> introducing faults in code which did not previously fault.
>>
>>
>> Huh? This patch is purely an (attempt at) optimization, not something
>> that fixes any problems.
>
> Then I must be mentally merging two things Abe has been working on then.
> He's certainly had an if-converter patch that was designed to avoid
> introducing races in code that didn't previously have races.
>
> Looking back through the archives that appears to be the case. His patches
> to avoid racing are for the tree level if converter, not the RTL if
> converter.

Even for the tree level this wasn't the case, he just run into a bug
of the existing
converter that I've fixed meanwhile.

> Sigh, sorry for the confusion.  It's totally my fault.  Assuming Abe doesn't
> have a correctness case at all here, then I don't see any way for the code
> to go forward as-is since it's likely making things significantly worse.
>
>>
>> I can't test valgrind right now, it fails to run on my machine, but I
>> guess it could adapt to allow stores slightly below the stack (maybe
>> warning once)? It seems like a bit of an edge case to worry about, but
>> if supporting it is critical and it can't be changed to adapt to new
>> optimizations, then I think we're probably better off entirely without
>> this scratchpad transformation.
>>
>> Alternatively I can think of a few other possible approaches which
>> wouldn't require this kind of bloat:
>>   * add support for allocating space in the stack redzone. That could be
>>     interesting for the register allocator as well. Would help only
>>     x86_64, but that's a large fraction of gcc's userbase.
>>   * add support for opportunistically finding unused alignment padding
>>     in the existing stack frame. Less likely to work but would produce
>>     better results when it does.
>>   * on embedded targets we probably don't have to worry about valgrind,
>>     so do the optimal (sp - x) thing there
>>   * allocate a single global as the dummy target. Might be more
>>     expensive to load the address on some targets though.
>>   * at least find a way to express costs for this transformation.
>>     Difficult since you don't yet necessarily know if the function is
>>     going to have a stack frame. Hence, IMO this approach is flawed.
>>     (You'll still want cost estimates even when not allocating stuff in
>>     the normal stack frame, because generated code will still execute
>>     between two and four extra instructions).
>
> One could argue these should all be on the table.  However, I tend to really
> dislike using area beyond the current stack.  I realize it's throw-away
> data, but it just seems like a bad idea to me -- even on embedded targets
> that don't support valgrind.
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14 19:15         ` Bernd Schmidt
  2015-10-15  8:52           ` Richard Biener
@ 2015-10-20  5:52           ` Jeff Law
  2015-10-20  9:37             ` Richard Biener
  1 sibling, 1 reply; 19+ messages in thread
From: Jeff Law @ 2015-10-20  5:52 UTC (permalink / raw)
  To: Bernd Schmidt, Abe; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

On 10/14/2015 01:15 PM, Bernd Schmidt wrote:
> On 10/14/2015 07:43 PM, Jeff Law wrote:
>> Obviously some pessimization relative to current code is necessary to
>> fix some of the problems WRT thread safety and avoiding things like
>> introducing faults in code which did not previously fault.
>
> Huh? This patch is purely an (attempt at) optimization, not something
> that fixes any problems.
Then I must be mentally merging two things Abe has been working on then. 
  He's certainly had an if-converter patch that was designed to avoid 
introducing races in code that didn't previously have races.

Looking back through the archives that appears to be the case. His 
patches to avoid racing are for the tree level if converter, not the RTL 
if converter.

Sigh, sorry for the confusion.  It's totally my fault.  Assuming Abe 
doesn't have a correctness case at all here, then I don't see any way 
for the code to go forward as-is since it's likely making things 
significantly worse.

>
> I can't test valgrind right now, it fails to run on my machine, but I
> guess it could adapt to allow stores slightly below the stack (maybe
> warning once)? It seems like a bit of an edge case to worry about, but
> if supporting it is critical and it can't be changed to adapt to new
> optimizations, then I think we're probably better off entirely without
> this scratchpad transformation.
>
> Alternatively I can think of a few other possible approaches which
> wouldn't require this kind of bloat:
>   * add support for allocating space in the stack redzone. That could be
>     interesting for the register allocator as well. Would help only
>     x86_64, but that's a large fraction of gcc's userbase.
>   * add support for opportunistically finding unused alignment padding
>     in the existing stack frame. Less likely to work but would produce
>     better results when it does.
>   * on embedded targets we probably don't have to worry about valgrind,
>     so do the optimal (sp - x) thing there
>   * allocate a single global as the dummy target. Might be more
>     expensive to load the address on some targets though.
>   * at least find a way to express costs for this transformation.
>     Difficult since you don't yet necessarily know if the function is
>     going to have a stack frame. Hence, IMO this approach is flawed.
>     (You'll still want cost estimates even when not allocating stuff in
>     the normal stack frame, because generated code will still execute
>     between two and four extra instructions).
One could argue these should all be on the table.  However, I tend to 
really dislike using area beyond the current stack.  I realize it's 
throw-away data, but it just seems like a bad idea to me -- even on 
embedded targets that don't support valgrind.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14 19:15         ` Bernd Schmidt
@ 2015-10-15  8:52           ` Richard Biener
  2015-10-20  5:52           ` Jeff Law
  1 sibling, 0 replies; 19+ messages in thread
From: Richard Biener @ 2015-10-15  8:52 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Jeff Law, Abe, Sebastian Pop, Kyrill Tkachov, gcc-patches

On Wed, Oct 14, 2015 at 9:15 PM, Bernd Schmidt <bschmidt@redhat.com> wrote:
> On 10/14/2015 07:43 PM, Jeff Law wrote:
>>
>> Obviously some pessimization relative to current code is necessary to
>> fix some of the problems WRT thread safety and avoiding things like
>> introducing faults in code which did not previously fault.
>
>
> Huh? This patch is purely an (attempt at) optimization, not something that
> fixes any problems.
>
>> However, pessimization of safe code is, err, um, bad and needs to be
>> avoided.
>
>
> Here's an example:
>
>                                   >         subq    $16, %rsp
> [...]
>                                   >         leaq    8(%rsp), %r8
>                                   >         leaq    256(%rax), %rdx
>     cmpq    256(%rax), %rcx       |         cmpq    256(%rax), %rsi
>     jne    .L97                   <
>     movq    $0, 256(%rax)         <
> .L97:                             <
>                                   >         movq    %rdx, %rax
>                                   >         cmovne  %r8, %rax
>                                   >         movq    $0, (%rax)
> [...]
>                                   >         addq    $16, %rsp
>
> In the worst case that executes six more instructions, and always causes
> unnecessary stack frame bloat. This on x86 where AFAIK it's doubtful whether
> cmov is a win at all anyway. I think this shows the approach is just bad,
> even ignoring problems like that it could allocate multiple scratchpads when
> one would suffice, or allocate one and end up not using it because the
> transformation fails.
>
> I can't test valgrind right now, it fails to run on my machine, but I guess
> it could adapt to allow stores slightly below the stack (maybe warning
> once)? It seems like a bit of an edge case to worry about, but if supporting
> it is critical and it can't be changed to adapt to new optimizations, then I
> think we're probably better off entirely without this scratchpad
> transformation.
>
> Alternatively I can think of a few other possible approaches which wouldn't
> require this kind of bloat:
>  * add support for allocating space in the stack redzone. That could be
>    interesting for the register allocator as well. Would help only
>    x86_64, but that's a large fraction of gcc's userbase.
>  * add support for opportunistically finding unused alignment padding
>    in the existing stack frame. Less likely to work but would produce
>    better results when it does.
>  * on embedded targets we probably don't have to worry about valgrind,
>    so do the optimal (sp - x) thing there
>  * allocate a single global as the dummy target. Might be more
>    expensive to load the address on some targets though.
>  * at least find a way to express costs for this transformation.
>    Difficult since you don't yet necessarily know if the function is
>    going to have a stack frame. Hence, IMO this approach is flawed.
>    (You'll still want cost estimates even when not allocating stuff in
>    the normal stack frame, because generated code will still execute
>    between two and four extra instructions).

Btw, I agree with all your concerns and indeed finding a better place
we can always store to is required.  I wonder if we can do the actual
location assignment after reload (searching for (temporarily) unused
already allocated stack space for example).  Using the red-zone also
crossed my mind.

As for performance of the resulting code I'd allow the target to
decide whether to never do this, whether to do this only with
profile-feedback (and 50%/50% chance) or whether to do this
always.

Most branch predictors have issues with high branch density so
if you have a sequence of

 if (a) *p = ..;
 if (b) *q = ..;
 if (c) *r = ..;

then the jump form may have a big penalty (or a lot of padding
to avoid it).  I'm not sure whether conditional moves in that case
behave better (no idea if they also enter the prediction machinery
for speculative execution).

Richard.

>
> Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14 17:43       ` Jeff Law
@ 2015-10-14 19:15         ` Bernd Schmidt
  2015-10-15  8:52           ` Richard Biener
  2015-10-20  5:52           ` Jeff Law
  0 siblings, 2 replies; 19+ messages in thread
From: Bernd Schmidt @ 2015-10-14 19:15 UTC (permalink / raw)
  To: Jeff Law, Abe; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

On 10/14/2015 07:43 PM, Jeff Law wrote:
> Obviously some pessimization relative to current code is necessary to
> fix some of the problems WRT thread safety and avoiding things like
> introducing faults in code which did not previously fault.

Huh? This patch is purely an (attempt at) optimization, not something 
that fixes any problems.

> However, pessimization of safe code is, err, um, bad and needs to be
> avoided.

Here's an example:

                                   >         subq    $16, %rsp
[...]
                                   >         leaq    8(%rsp), %r8
                                   >         leaq    256(%rax), %rdx
     cmpq    256(%rax), %rcx       |         cmpq    256(%rax), %rsi
     jne    .L97                   <
     movq    $0, 256(%rax)         <
.L97:                             <
                                   >         movq    %rdx, %rax
                                   >         cmovne  %r8, %rax
                                   >         movq    $0, (%rax)
[...]
                                   >         addq    $16, %rsp

In the worst case that executes six more instructions, and always causes 
unnecessary stack frame bloat. This on x86 where AFAIK it's doubtful 
whether cmov is a win at all anyway. I think this shows the approach is 
just bad, even ignoring problems like that it could allocate multiple 
scratchpads when one would suffice, or allocate one and end up not using 
it because the transformation fails.

I can't test valgrind right now, it fails to run on my machine, but I 
guess it could adapt to allow stores slightly below the stack (maybe 
warning once)? It seems like a bit of an edge case to worry about, but 
if supporting it is critical and it can't be changed to adapt to new 
optimizations, then I think we're probably better off entirely without 
this scratchpad transformation.

Alternatively I can think of a few other possible approaches which 
wouldn't require this kind of bloat:
  * add support for allocating space in the stack redzone. That could be
    interesting for the register allocator as well. Would help only
    x86_64, but that's a large fraction of gcc's userbase.
  * add support for opportunistically finding unused alignment padding
    in the existing stack frame. Less likely to work but would produce
    better results when it does.
  * on embedded targets we probably don't have to worry about valgrind,
    so do the optimal (sp - x) thing there
  * allocate a single global as the dummy target. Might be more
    expensive to load the address on some targets though.
  * at least find a way to express costs for this transformation.
    Difficult since you don't yet necessarily know if the function is
    going to have a stack frame. Hence, IMO this approach is flawed.
    (You'll still want cost estimates even when not allocating stuff in
    the normal stack frame, because generated code will still execute
    between two and four extra instructions).


Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14  8:29     ` Eric Botcazou
@ 2015-10-14 17:46       ` Jeff Law
  0 siblings, 0 replies; 19+ messages in thread
From: Jeff Law @ 2015-10-14 17:46 UTC (permalink / raw)
  To: Eric Botcazou, Richard Henderson
  Cc: gcc-patches, Bernd Schmidt, Abe, Sebastian Pop, Kyrill Tkachov

On 10/14/2015 02:28 AM, Eric Botcazou wrote:
>> If you're using one of the switches that checks for stack overflow at the
>> start of the function, you certainly don't want to do any such stores.
>
> There is a protection area for -fstack-check (STACK_CHECK_PROTECT bytes) so
> you can do stores just below the stack pointer as far as it's concerned.
>
> There is indeed the issue of the mere writing below the stack pointer.  Our
> experience with various OSes and architectures shows that this almost always
> works.  The only problematic case is x86{-64}/Linux historically, where you
> cannot write below the page pointed to by the stack pointer (that's why there
> is a specific implementation of -fstack-check for x86{-64}/Linux).
It was problematical on the PA, but I can't recall precisely why.

The thing we need to remember here is that if we do somethig like use 
space just below the stack pointer, valgrind is probably going to start 
complaining (and legitimately so).

While we know the result is throw-away, valgrind doesn't and the 
complains and noise from this would IMHO outweigh the benefits from 
using the trick of reading outside the defined stack area.

jeff

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-13 20:16     ` Bernd Schmidt
@ 2015-10-14 17:43       ` Jeff Law
  2015-10-14 19:15         ` Bernd Schmidt
  0 siblings, 1 reply; 19+ messages in thread
From: Jeff Law @ 2015-10-14 17:43 UTC (permalink / raw)
  To: Bernd Schmidt, Abe; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

On 10/13/2015 02:16 PM, Bernd Schmidt wrote:
>> _Potentially_ so, yes.  However, GCC is free to put the allocation into
>> an otherwise-unused part of the stack frame.
>
> Well, I looked at code generation changes, and it usually seems to come
> with an increase in stack frame size - sometimes causing extra
> instructions to be emitted.
I think that's essentially unavoidable when we end up using the scratchpad.

>
>>> However, why do we need to allocate anything in the first place?
>>  > If you want to store something that will be thrown away,
>>  > just pick an address below the stack pointer.
>>
>> Because allocating a scratchpad should work on all relevant targets.  We
>> do not have the resources to test on all GCC-supported
>> CPU ISAs and on all GCC-supported OSes, and we would like to have an
>> optimization that works on as many targets as makes sense
>> [those with cmove-like ability and withOUT full-blown conditional
>> execution].
>
> Yeah, but if you put in a new facility like this, chances are
> maintainers for active targets will pick it up and add the necessary
> hooks. That's certainly what happened with shrink-wrapping. So I don't
> think this is a concern.
But won't you get valgrind warnings if the code loads/stores outside the 
defined stack?  While we know it's safe, the warnings from valgrind will 
likely cause a backlash of user complaints.

>
> I'm afraid I'll have to reject the patch then, on these grounds:
>   * it may pessimize code
>   * it does not even estimate costs to attempt avoiding this
>   * a much simpler, more efficient implementation is possible.
Noted.  I think the pessimization is the area were folks are most concerned.

Obviously some pessimization relative to current code is necessary to 
fix some of the problems WRT thread safety and avoiding things like 
introducing faults in code which did not previously fault.

However, pessimization of safe code is, err, um, bad and needs to be 
avoided.

Jeff

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14  1:05   ` Richard Henderson
  2015-10-14  1:11     ` Richard Henderson
@ 2015-10-14  8:29     ` Eric Botcazou
  2015-10-14 17:46       ` Jeff Law
  1 sibling, 1 reply; 19+ messages in thread
From: Eric Botcazou @ 2015-10-14  8:29 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Bernd Schmidt, Abe, Sebastian Pop, Kyrill Tkachov

> If you're using one of the switches that checks for stack overflow at the
> start of the function, you certainly don't want to do any such stores.

There is a protection area for -fstack-check (STACK_CHECK_PROTECT bytes) so 
you can do stores just below the stack pointer as far as it's concerned.

There is indeed the issue of the mere writing below the stack pointer.  Our 
experience with various OSes and architectures shows that this almost always 
works.  The only problematic case is x86{-64}/Linux historically, where you 
cannot write below the page pointed to by the stack pointer (that's why there 
is a specific implementation of -fstack-check for x86{-64}/Linux).

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-14  1:05   ` Richard Henderson
@ 2015-10-14  1:11     ` Richard Henderson
  2015-10-14  8:29     ` Eric Botcazou
  1 sibling, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2015-10-14  1:11 UTC (permalink / raw)
  To: Bernd Schmidt, Abe, gcc-patches, Sebastian Pop, Kyrill Tkachov

On 10/14/2015 12:05 PM, Richard Henderson wrote:
> If you're using one of the switches that checks for stack overflow at the start
> of the function, you certainly don't want to do any such stores.

Oh, and for a given target the kernel may consider any write to the stack vma 
below the stack pointer as invalid.

The x86 kernels will at least handle "enter $65535, $31", which can write to a 
bit more than 64k below %esp before %esp gets updated, but that's probably not 
going to be true of most risc targets.

r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-08 13:23 ` Bernd Schmidt
  2015-10-13 19:34   ` Abe
@ 2015-10-14  1:05   ` Richard Henderson
  2015-10-14  1:11     ` Richard Henderson
  2015-10-14  8:29     ` Eric Botcazou
  1 sibling, 2 replies; 19+ messages in thread
From: Richard Henderson @ 2015-10-14  1:05 UTC (permalink / raw)
  To: Bernd Schmidt, Abe, gcc-patches, Sebastian Pop, Kyrill Tkachov

On 10/09/2015 12:23 AM, Bernd Schmidt wrote:
> On 10/08/2015 01:29 AM, Abe wrote:
>> Attached please find my revised patch to the RTL if converter.  This
>> patch enables the
>> if-conversion of half-hammocks with a store in them that the internal
>> GCC machinery
>> otherwise considers too hazardous to if-convert.  This is made safe by
>> using the
>> "scratchpad" technique, i.e. throwing away the store into a safe
>> location where nothing
>> of any importance is currently stored.  The scratchpads are allocated in
>> the stack frame.
>
> So, one conceptual issue first. Obviously this increases the size of the stack
> frame, which makes the transformation more expensive. The patch does not appear
> to attempt to estimate costs. However, why do we need to allocate anything in
> the first place? If you want to store something that will be thrown away, just
> pick an address below the stack pointer.

If you're using one of the switches that checks for stack overflow at the start 
of the function, you certainly don't want to do any such stores.


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-13 19:34   ` Abe
@ 2015-10-13 20:16     ` Bernd Schmidt
  2015-10-14 17:43       ` Jeff Law
  0 siblings, 1 reply; 19+ messages in thread
From: Bernd Schmidt @ 2015-10-13 20:16 UTC (permalink / raw)
  To: Abe; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

> _Potentially_ so, yes.  However, GCC is free to put the allocation into
> an otherwise-unused part of the stack frame.

Well, I looked at code generation changes, and it usually seems to come 
with an increase in stack frame size - sometimes causing extra 
instructions to be emitted.

>> However, why do we need to allocate anything in the first place?
>  > If you want to store something that will be thrown away,
>  > just pick an address below the stack pointer.
>
> Because allocating a scratchpad should work on all relevant targets.  We
> do not have the resources to test on all GCC-supported
> CPU ISAs and on all GCC-supported OSes, and we would like to have an
> optimization that works on as many targets as makes sense
> [those with cmove-like ability and withOUT full-blown conditional
> execution].

Yeah, but if you put in a new facility like this, chances are 
maintainers for active targets will pick it up and add the necessary 
hooks. That's certainly what happened with shrink-wrapping. So I don't 
think this is a concern.

> I agree that your suggestion of having one global default scratchpad
> allocation policy plus per-target
> overrides that are more efficient _is_ a good one, but it will have to
> wait a while for implementation
> if that`s to be done by me.  In the meantime, the existing allocation
> policy is compatible with
> multiple targets and costs very little space in the stack frame, if and
> when any at all.

I'm afraid I'll have to reject the patch then, on these grounds:
  * it may pessimize code
  * it does not even estimate costs to attempt avoiding this
  * a much simpler, more efficient implementation is possible.

>>> +        MEM_NOTRAP_P (mem) = true;
>> So I'm still not entirely sure which cases you are trying to optimize
>> and which ones not,
>
> The current patch focuses entirely on half-hammock writes with stores to
> addresses
> about which GCC "feels nervous", i.e. "may trap or fault"; for example:
>
>    if (condition)
>      *pointer = 9;
>    // no "else" or "else if"
>
>
>> but couldn't this technique allow a trapping store here?
>
> The purpose of the new if-conversion is to take a may-trap-or-fault
> store and replace it with a store
> that will be OK if the original program was OK with respect to the
> current execution`s inputs,
> environment, PRNG results, etc.  For example, the only way the
> if-converted code would dereference a
> null pointer is if/when the original program would have done the same
> thing under the same conditions.

Yeah, but it could still trap if the original program had an error. So I 
don't think setting MEM_NOTRAP_P is right.


Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-08 13:23 ` Bernd Schmidt
@ 2015-10-13 19:34   ` Abe
  2015-10-13 20:16     ` Bernd Schmidt
  2015-10-14  1:05   ` Richard Henderson
  1 sibling, 1 reply; 19+ messages in thread
From: Abe @ 2015-10-13 19:34 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

[Bernd wrote:]

> Obviously this increases the size of the stack frame,

_Potentially_ so, yes.  However, GCC is free to put the allocation into an otherwise-unused part of the stack frame.

> The patch does not appear to attempt to estimate costs.

There is nothing new relating to cost estimation in my patch, that`s true.

As some of you already know, James Greenhalgh has been working on cost analysis in "ifcvt.c" recently,
as can be seen, for example, here:  https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01971.html

> However, why do we need to allocate anything in the first place?
 > If you want to store something that will be thrown away,
 > just pick an address below the stack pointer.

Because allocating a scratchpad should work on all relevant targets.  We do not have the resources to test on all GCC-supported
CPU ISAs and on all GCC-supported OSes, and we would like to have an optimization that works on as many targets as makes sense
[those with cmove-like ability and withOUT full-blown conditional execution].  In other words, we chose to "play it safe".
We considered the "just past the stack pointer" technique early on in this project, but judged it as too risky to apply
to all targets w/o testing either on those targets or at least on an emulator/simulator of those targets.

I agree that your suggestion of having one global default scratchpad allocation policy plus per-target
overrides that are more efficient _is_ a good one, but it will have to wait a while for implementation
if that`s to be done by me.  In the meantime, the existing allocation policy is compatible with
multiple targets and costs very little space in the stack frame, if and when any at all.

Thanks for your [Bernd`s] other feedback, not copied here; I have applied lots of corrections and improvements as a result.

>> +        MEM_NOTRAP_P (mem) = true;
> So I'm still not entirely sure which cases you are trying to optimize and which ones not,

The current patch focuses entirely on half-hammock writes with stores to addresses
about which GCC "feels nervous", i.e. "may trap or fault"; for example:

   if (condition)
     *pointer = 9;
   // no "else" or "else if"

> but couldn't this technique allow a trapping store here?

The purpose of the new if-conversion is to take a may-trap-or-fault store and replace it with a store
that will be OK if the original program was OK with respect to the current execution`s inputs,
environment, PRNG results, etc.  For example, the only way the if-converted code would dereference a
null pointer is if/when the original program would have done the same thing under the same conditions.

Regards,

Abe

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-08 13:20   ` Sebastian Pop
@ 2015-10-08 13:26     ` Bernd Schmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Bernd Schmidt @ 2015-10-08 13:26 UTC (permalink / raw)
  To: Sebastian Pop, Abe; +Cc: gcc-patches, Kyrill Tkachov

> + /* We must copy the insns between the start of the THEN block
> +   and the set of 'a', if they exist, since they may be needed
> +   for the converted code as well, but we must not copy a
> +   start-of-BB note if one is present, nor debug "insn"s.  */
> +
> + for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
> +  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
> +  {
>
> Please remove the braces: the loop body is a single stmt.

Oh, I miscounted. That makes seven then.


Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-07 23:29 Abe
  2015-10-08 13:09 ` Sebastian Pop
@ 2015-10-08 13:23 ` Bernd Schmidt
  2015-10-13 19:34   ` Abe
  2015-10-14  1:05   ` Richard Henderson
  1 sibling, 2 replies; 19+ messages in thread
From: Bernd Schmidt @ 2015-10-08 13:23 UTC (permalink / raw)
  To: Abe, gcc-patches, Sebastian Pop, Kyrill Tkachov

On 10/08/2015 01:29 AM, Abe wrote:
> Attached please find my revised patch to the RTL if converter.  This
> patch enables the
> if-conversion of half-hammocks with a store in them that the internal
> GCC machinery
> otherwise considers too hazardous to if-convert.  This is made safe by
> using the
> "scratchpad" technique, i.e. throwing away the store into a safe
> location where nothing
> of any importance is currently stored.  The scratchpads are allocated in
> the stack frame.

So, one conceptual issue first. Obviously this increases the size of the 
stack frame, which makes the transformation more expensive. The patch 
does not appear to attempt to estimate costs. However, why do we need to 
allocate anything in the first place? If you want to store something 
that will be thrown away, just pick an address below the stack pointer. 
I think some ports may need different strategies due to stack bias or 
red zones, so a target hook is in order, with one safe default to fail, 
and one default implementation that can be used by most targets, and 
then specialized versions in target-dependent code where necessary:

rtx
default_get_scratchpad_fail (HOST_WIDE_INT size)
{
   return NULL_RTX;
}

rtx
default_get_scratchpad (HOST_WIDE_INT size)
{
   /* Possibly also take STACK_BOUNDARY into account so as to not
      make unaligned locations.  */
   if (size >= param (SCRATCHPAD_MAX_SIZE))
     return NULL_RTX;
   return plus_constant (stack_pointer_rtx, gen_int_mode (-size, Pmode));
}

With that, I think all the code to keep track of scratchpads can just be 
deleted.

There's this preexisting comment:

/* ??? This is overconservative. Storing to two different mems is
as easy as conditionally computing the address. Storing to a
single mem merely requires a scratch memory to use as one of the
destination addresses; often the memory immediately below the
stack pointer is available for this. */

suggesting that it ought to be possible to generalize the technique to 
stores to different addresses.

To the patch itself. The code still has many stylistic problems and does 
not follow the required guidelines.

> +#include "diagnostic-color.h"

Why?

>
> -/* Return true if a write into MEM may trap or fault.  */
> +/* Return true if a write into MEM may trap or fault
> +   even in the presence of scratchpad support.  */

>
> +/* Return true if a write into MEM may trap or fault
> +   without scratchpad support.  */

Please explain the rationale for these changes. What exactly is 
different with scratchpads?

> +	  /* The next "if": quoting "noce_emit_cmove":
> +	       If we can't create new pseudos, though, don't bother.  */
> +	  if (reload_completed)
> +	    return FALSE;
> +
> +	  if (optimize<2)
> +	    return FALSE;
> +
> +	  if (optimize_function_for_size_p (cfun))
> +	    return FALSE;
> +
> +	  if (targetm.have_conditional_execution () || ! HAVE_conditional_move)
> +	    return FALSE;

Merge the conditions into one if. Watch spacing around operators.

> +
> +	  const bool not_a_scratchpad_candidate =
> +	    noce_mem_write_may_trap_or_fault_p_1 (orig_x);
> +	  if (! not_a_scratchpad_candidate)

The = should start a line, but what you really should do is just put the 
condition into the if and eliminate the variable.

> +	      const size_t size_of_MEM = MEM_SIZE (orig_x);

Identifiers are still too verbose. This is typically just called size, 
or memsize if there are other sizes to keep track of.

> +
> +		for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
> +		  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
> +		  {
> +		     if (! (NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))

There are six different coding style violations in this block. Please 
identify and fix them (elsewhere as well). In addition, I think it would 
be better to start each part of the for statement on its own line for 
clarity.

I still need to figure out what is going on in this insn-copying loop.

 > +		/* Done copying the needed insns between the start of the
 > +		   THEN block and the set of 'a', if any.  */
 > +
 > +		if (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1)))
 > +		  {
 > +		    end_sequence ();
 > +		    return FALSE;
 > +		  }

This should be done earlier before you go to the effort of copying insns.

> +		MEM_NOTRAP_P (mem) = true;

So I'm still not entirely sure which cases you are trying to optimize 
and which ones not, but couldn't this technique allow a trapping store here?

Bernd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-08 13:09 ` Sebastian Pop
@ 2015-10-08 13:20   ` Sebastian Pop
  2015-10-08 13:26     ` Bernd Schmidt
  0 siblings, 1 reply; 19+ messages in thread
From: Sebastian Pop @ 2015-10-08 13:20 UTC (permalink / raw)
  To: Abe; +Cc: gcc-patches, Bernd Schmidt, Kyrill Tkachov

Abe,

please avoid comments that are not needed.

+ /* We must copy the insns between the start of the THEN block
+   and the set of 'a', if they exist, since they may be needed
+   for the converted code as well, but we must not copy a
+   start-of-BB note if one is present, nor debug "insn"s.  */
+
+ for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
+  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
+  {

Please remove the braces: the loop body is a single stmt.

+     if (! (NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+       duplicate_insn_chain (insn, insn);
+       /* A return of 0 from "duplicate_insn_chain" is _not_
+  a failure; it just returns the "NEXT_INSN" of the
+  last insn it duplicated.  */

Please remove this comment.

+  }
+
+ /* Done copying the needed insns between the start of the
+   THEN block and the set of 'a', if any.  */

This comment duplicates the same content as the comment before the loop.
Please remove.


On Thu, Oct 8, 2015 at 8:08 AM, Sebastian Pop <sebpop@gmail.com> wrote:
> Hi Abe,
>
> could you please avoid double negations, and
> please use early returns rather than huge right indentations:
>
> +  if (! not_a_scratchpad_candidate)
> +  {
> +    if (MEM_SIZE_KNOWN_P (orig_x))
> +    {
> +      const size_t size_of_MEM = MEM_SIZE (orig_x);
> +
> +      if (size_of_MEM <= SCRATCHPAD_MAX_SIZE)
> +      {
> [...]
> +      }
> +    }
> +  }
> +  return FALSE;
>
> Just rewrite as:
>
> if (not_a_scratchpad_candidate
>     || !MEM_SIZE_KNOWN_P (orig_x))
>   return FALSE;
>
> const size_t size_of_MEM = MEM_SIZE (orig_x);
> if (size_of_MEM > SCRATCHPAD_MAX_SIZE)
>   return FALSE;
>
> That will save 3 levels of indent.
>
> Also some of your braces do not seem to be correctly placed.
> Please use clang-format on your patch to solve the indentation issues.
>
> Thanks,
> Sebastian
>
>
> On Wed, Oct 7, 2015 at 6:29 PM, Abe <abe_skolnik@yahoo.com> wrote:
>> Dear all,
>>
>> Attached please find my revised patch to the RTL if converter.  This patch
>> enables the
>> if-conversion of half-hammocks with a store in them that the internal GCC
>> machinery
>> otherwise considers too hazardous to if-convert.  This is made safe by using
>> the
>> "scratchpad" technique, i.e. throwing away the store into a safe location
>> where nothing
>> of any importance is currently stored.  The scratchpads are allocated in the
>> stack frame.
>>
>> Here is an example of code which is newly converted with this patch,
>> at least when targeting AArch64:
>>
>>   int A[10];
>>
>>   void half_hammock() {
>>     if (A[0])
>>       A[1] = 2;
>>   }
>>
>>
>> Both tested against trunk and bootstrapped OK with defaults* on
>> AMD64-AKA-"x86_64" GNU/Linux.
>>
>> '*': [except for "--prefix"]
>>
>>
>> I`m sending the patch as an attachment to avoid it
>> being corrupted/reformatted by any e-mail troubles.
>>
>> I look forward to your feedback.
>>
>> Regards,
>>
>> Abe
>>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: using scratchpads to enhance RTL-level if-conversion: revised patch
  2015-10-07 23:29 Abe
@ 2015-10-08 13:09 ` Sebastian Pop
  2015-10-08 13:20   ` Sebastian Pop
  2015-10-08 13:23 ` Bernd Schmidt
  1 sibling, 1 reply; 19+ messages in thread
From: Sebastian Pop @ 2015-10-08 13:09 UTC (permalink / raw)
  To: Abe; +Cc: gcc-patches, Bernd Schmidt, Kyrill Tkachov

Hi Abe,

could you please avoid double negations, and
please use early returns rather than huge right indentations:

+  if (! not_a_scratchpad_candidate)
+  {
+    if (MEM_SIZE_KNOWN_P (orig_x))
+    {
+      const size_t size_of_MEM = MEM_SIZE (orig_x);
+
+      if (size_of_MEM <= SCRATCHPAD_MAX_SIZE)
+      {
[...]
+      }
+    }
+  }
+  return FALSE;

Just rewrite as:

if (not_a_scratchpad_candidate
    || !MEM_SIZE_KNOWN_P (orig_x))
  return FALSE;

const size_t size_of_MEM = MEM_SIZE (orig_x);
if (size_of_MEM > SCRATCHPAD_MAX_SIZE)
  return FALSE;

That will save 3 levels of indent.

Also some of your braces do not seem to be correctly placed.
Please use clang-format on your patch to solve the indentation issues.

Thanks,
Sebastian


On Wed, Oct 7, 2015 at 6:29 PM, Abe <abe_skolnik@yahoo.com> wrote:
> Dear all,
>
> Attached please find my revised patch to the RTL if converter.  This patch
> enables the
> if-conversion of half-hammocks with a store in them that the internal GCC
> machinery
> otherwise considers too hazardous to if-convert.  This is made safe by using
> the
> "scratchpad" technique, i.e. throwing away the store into a safe location
> where nothing
> of any importance is currently stored.  The scratchpads are allocated in the
> stack frame.
>
> Here is an example of code which is newly converted with this patch,
> at least when targeting AArch64:
>
>   int A[10];
>
>   void half_hammock() {
>     if (A[0])
>       A[1] = 2;
>   }
>
>
> Both tested against trunk and bootstrapped OK with defaults* on
> AMD64-AKA-"x86_64" GNU/Linux.
>
> '*': [except for "--prefix"]
>
>
> I`m sending the patch as an attachment to avoid it
> being corrupted/reformatted by any e-mail troubles.
>
> I look forward to your feedback.
>
> Regards,
>
> Abe
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* using scratchpads to enhance RTL-level if-conversion: revised patch
@ 2015-10-07 23:29 Abe
  2015-10-08 13:09 ` Sebastian Pop
  2015-10-08 13:23 ` Bernd Schmidt
  0 siblings, 2 replies; 19+ messages in thread
From: Abe @ 2015-10-07 23:29 UTC (permalink / raw)
  To: gcc-patches, Sebastian Pop, Bernd Schmidt, Kyrill Tkachov

[-- Attachment #1: Type: text/plain, Size: 903 bytes --]

Dear all,

Attached please find my revised patch to the RTL if converter.  This patch enables the
if-conversion of half-hammocks with a store in them that the internal GCC machinery
otherwise considers too hazardous to if-convert.  This is made safe by using the
"scratchpad" technique, i.e. throwing away the store into a safe location where nothing
of any importance is currently stored.  The scratchpads are allocated in the stack frame.

Here is an example of code which is newly converted with this patch,
at least when targeting AArch64:

   int A[10];

   void half_hammock() {
     if (A[0])
       A[1] = 2;
   }


Both tested against trunk and bootstrapped OK with defaults* on AMD64-AKA-"x86_64" GNU/Linux.

'*': [except for "--prefix"]


I`m sending the patch as an attachment to avoid it
being corrupted/reformatted by any e-mail troubles.

I look forward to your feedback.

Regards,

Abe


[-- Attachment #2: Abe_RTL_ifcvt_patch_1___2015_Oct._7_.patch --]
[-- Type: text/plain, Size: 7836 bytes --]

--- ifcvt.c	2015-09-01 12:54:38.234108158 -0500
+++ ifcvt.c	2015-10-07 11:14:11.606439645 -0500
@@ -47,6 +47,7 @@
 #include "insn-codes.h"
 #include "optabs.h"
 #include "diagnostic-core.h"
+#include "diagnostic-color.h"
 #include "tm_p.h"
 #include "cfgloop.h"
 #include "target.h"
@@ -56,6 +57,8 @@
 #include "rtl-iter.h"
 #include "ifcvt.h"
 
+#include <utility>
+
 #ifndef MAX_CONDITIONAL_EXECUTE
 #define MAX_CONDITIONAL_EXECUTE \
   (BRANCH_COST (optimize_function_for_speed_p (cfun), false) \
@@ -66,6 +69,9 @@
 
 #define NULL_BLOCK	((basic_block) NULL)
 
+/* An arbitrary inclusive maximum size (in bytes) for each scratchpad.  */
+#define SCRATCHPAD_MAX_SIZE 128
+
 /* True if after combine pass.  */
 static bool ifcvt_after_combine;
 
@@ -110,6 +116,8 @@ static int dead_or_predicable (basic_blo
 			       edge, int);
 static void noce_emit_move_insn (rtx, rtx);
 static rtx_insn *block_has_only_trap (basic_block);
+
+static auto_vec<std::pair<rtx, unsigned int> > scratchpads;
 \f
 /* Count the number of non-jump active insns in BB.  */
 
@@ -2784,19 +2792,16 @@ noce_operand_ok (const_rtx op)
   return ! may_trap_p (op);
 }
 
-/* Return true if a write into MEM may trap or fault.  */
+/* Return true if a write into MEM may trap or fault
+   even in the presence of scratchpad support.  */
 
 static bool
-noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+noce_mem_write_may_trap_or_fault_p_1 (const_rtx mem)
 {
-  rtx addr;
-
   if (MEM_READONLY_P (mem))
     return true;
 
-  if (may_trap_or_fault_p (mem))
-    return true;
-
+  rtx addr;
   addr = XEXP (mem, 0);
 
   /* Call target hook to avoid the effects of -fpic etc....  */
@@ -2837,6 +2842,18 @@ noce_mem_write_may_trap_or_fault_p (cons
   return false;
 }
 
+/* Return true if a write into MEM may trap or fault
+   without scratchpad support.  */
+
+static bool
+noce_mem_write_may_trap_or_fault_p (const_rtx mem)
+{
+  if (may_trap_or_fault_p (mem))
+    return true;
+
+  return noce_mem_write_may_trap_or_fault_p_1 (mem);
+}
+
 /* Return whether we can use store speculation for MEM.  TOP_BB is the
    basic block above the conditional block where we are considering
    doing the speculative store.  We look for whether MEM is set
@@ -3156,17 +3173,149 @@ noce_process_if_block (struct noce_if_in
 
   if (!set_b && MEM_P (orig_x))
     {
-      /* Disallow the "if (...) x = a;" form (implicit "else x = x;")
-	 for optimizations if writing to x may trap or fault,
-	 i.e. it's a memory other than a static var or a stack slot,
-	 is misaligned on strict aligned machines or is read-only.  If
-	 x is a read-only memory, then the program is valid only if we
-	 avoid the store into it.  If there are stores on both the
-	 THEN and ELSE arms, then we can go ahead with the conversion;
-	 either the program is broken, or the condition is always
-	 false such that the other memory is selected.  */
+      /* Disallow the "if (...) x = a;" form (with no "else") for optimizations
+	 when x is misaligned on strict-alignment machines or is read-only.
+	 If x is a memory other than a static var or a stack slot: for targets
+	 _with_ conditional move and _without_ conditional execution,
+	 convert using the scratchpad technique, otherwise don`t convert.
+	 If x is a read-only memory, then the program is valid only if we avoid
+	 the store into it.  If there are stores on both the THEN and ELSE arms,
+	 then we can go ahead with the conversion; either the program is broken,
+	 or the condition is always false such that the other memory is selected.
+	 The non-scratchpad-based conversion here has an implicit "else x = x;".  */
       if (noce_mem_write_may_trap_or_fault_p (orig_x))
-	return FALSE;
+	{
+	  /* The next "if": quoting "noce_emit_cmove":
+	       If we can't create new pseudos, though, don't bother.  */
+	  if (reload_completed)
+	    return FALSE;
+
+	  if (optimize<2)
+	    return FALSE;
+
+	  if (optimize_function_for_size_p (cfun))
+	    return FALSE;
+
+	  if (targetm.have_conditional_execution () || ! HAVE_conditional_move)
+	    return FALSE;
+
+	  const bool not_a_scratchpad_candidate =
+	    noce_mem_write_may_trap_or_fault_p_1 (orig_x);
+
+	  if (! not_a_scratchpad_candidate)
+	  {
+	    if (MEM_SIZE_KNOWN_P (orig_x))
+	    {
+	      const size_t size_of_MEM = MEM_SIZE (orig_x);
+
+	      if (size_of_MEM <= SCRATCHPAD_MAX_SIZE)
+	      {
+		rtx biggest_scratchpad = 0;
+		unsigned int biggest_scratchpad_size = 0;
+		if (size_t vec_len = scratchpads.length ())
+		  {
+		    std::pair<rtx, unsigned> tmp_pair = scratchpads[vec_len-1];
+		    biggest_scratchpad = tmp_pair.first;
+		    biggest_scratchpad_size = tmp_pair.second;
+		  }
+		if (size_of_MEM > biggest_scratchpad_size)
+		{
+		  biggest_scratchpad_size = size_of_MEM;
+		  biggest_scratchpad = assign_stack_local
+		    (GET_MODE (orig_x), size_of_MEM, 0);
+		  gcc_assert (biggest_scratchpad);
+		  scratchpads.safe_push (std::make_pair (biggest_scratchpad,
+							 size_of_MEM));
+		}
+
+		gcc_assert (biggest_scratchpad);
+
+		rtx reg_for_store_addr = gen_reg_rtx (Pmode);
+		set_used_flags (reg_for_store_addr);
+
+		start_sequence ();
+
+		/* We must copy the insns between the start of the THEN block
+		   and the set of 'a', if they exist, since they may be needed
+		   for the converted code as well, but we must not copy a
+		   start-of-BB note if one is present, nor debug "insn"s.  */
+
+		for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
+		  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
+		  {
+		     if (! (NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+		       duplicate_insn_chain (insn, insn);
+		       /* A return of 0 from "duplicate_insn_chain" is _not_
+			  a failure; it just returns the "NEXT_INSN" of the
+			  last insn it duplicated.  */
+		  }
+
+		/* Done copying the needed insns between the start of the
+		   THEN block and the set of 'a', if any.  */
+
+		if (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1)))
+		  {
+		    end_sequence ();
+		    return FALSE;
+		  }
+
+		rtx target = noce_emit_cmove (if_info,
+					      reg_for_store_addr,
+					      GET_CODE (cond),
+					      XEXP (cond, 0),
+					      XEXP (cond, 1),
+					      XEXP (orig_x, 0),
+					      XEXP (biggest_scratchpad, 0));
+
+		if (!target)
+		  {
+		    end_sequence ();
+		    return FALSE;
+		  }
+		if (target != reg_for_store_addr)
+		  noce_emit_move_insn (reg_for_store_addr, target);
+
+		rtx mem = gen_rtx_MEM (GET_MODE (orig_x), reg_for_store_addr);
+		MEM_NOTRAP_P (mem) = true;
+		MEM_VOLATILE_P (mem) = MEM_VOLATILE_P (orig_x);
+
+		alias_set_type temp_alias_set = new_alias_set ();
+		if (MEM_ALIAS_SET (orig_x))
+		  record_alias_subset (MEM_ALIAS_SET (orig_x), temp_alias_set);
+		set_mem_alias_set (mem, temp_alias_set);
+
+		set_mem_align (mem,
+		  MIN (MEM_ALIGN (biggest_scratchpad), MEM_ALIGN (orig_x)));
+		if (MEM_ADDR_SPACE (orig_x)
+		 != MEM_ADDR_SPACE (biggest_scratchpad))
+		  {
+		    end_sequence ();
+		    return FALSE;
+		  }
+
+		set_used_flags (mem);
+
+		noce_emit_move_insn (mem, a);
+
+		rtx_insn *seq = end_ifcvt_sequence (if_info);
+		if (!seq)
+		  return FALSE;
+
+		unshare_all_rtl_in_chain (seq);
+
+		/* Prevent the code right after "success:"
+		   from throwing away the changes.  */
+		x = orig_x;
+
+		emit_insn_before_setloc (seq, if_info->jump,
+					 INSN_LOCATION (if_info->insn_a));
+		goto success;
+
+	      }
+	    }
+	  }
+	  return FALSE;
+	}
 
       /* Avoid store speculation: given "if (...) x = a" where x is a
 	 MEM, we only want to do the store if x is always set
@@ -4959,6 +5108,9 @@ if_convert (bool after_combine)
   basic_block bb;
   int pass;
 
+  /* Ensure that we start the scratchpads data fresh each time.  */
+  scratchpads.truncate (0);
+
   if (optimize == 1)
     {
       df_live_add_problem ();

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2015-10-30 14:08 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-13 20:05 using scratchpads to enhance RTL-level if-conversion: revised patch Abe
     [not found] <024301d11106$2379b5f0$6a6d21d0$@samsung.com>
2015-10-27 23:02 ` Abe
2015-10-30 14:09   ` Bernd Schmidt
  -- strict thread matches above, loose matches on Subject: below --
2015-10-07 23:29 Abe
2015-10-08 13:09 ` Sebastian Pop
2015-10-08 13:20   ` Sebastian Pop
2015-10-08 13:26     ` Bernd Schmidt
2015-10-08 13:23 ` Bernd Schmidt
2015-10-13 19:34   ` Abe
2015-10-13 20:16     ` Bernd Schmidt
2015-10-14 17:43       ` Jeff Law
2015-10-14 19:15         ` Bernd Schmidt
2015-10-15  8:52           ` Richard Biener
2015-10-20  5:52           ` Jeff Law
2015-10-20  9:37             ` Richard Biener
2015-10-14  1:05   ` Richard Henderson
2015-10-14  1:11     ` Richard Henderson
2015-10-14  8:29     ` Eric Botcazou
2015-10-14 17:46       ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).