public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* improved RTL-level if conversion using scratchpads [half-hammock edition]
@ 2015-11-05 23:43 Abe
  2015-11-06 10:56 ` Bernd Schmidt
  0 siblings, 1 reply; 11+ messages in thread
From: Abe @ 2015-11-05 23:43 UTC (permalink / raw)
  To: gcc-patches, Sebastian Pop, Bernd Schmidt, Kyrill Tkachov

[-- Attachment #1: Type: text/plain, Size: 1690 bytes --]

The attached improves on the relevant previous proposed patches mainly by sizing the scratchpad once for each routine being compiled so that
it never wastes an unneeded stack slot.  The size chosen is the "perfect" size, i.e. the size of the biggest scratchpad-converted type.
Also new this time: an override path for the allocation policy is provided as a target hook so that targets can use something other than
a normal stack slot for the scratchpad location.  It is left up to the target maintainers to implement e.g. redzone-based allocation.

Feedback from Bernd has also been applied.



2015-10-06  Abe Skolnik <a.skolnik@samsung.com>


         * ifcvt.c (find_if_header): Add a parameter.
         Return early when new parameter is true.
         (noce_find_if_block): Likewise.
         (noce_process_if_block): Likewise.
         (if_convert): Clear the scratchpad-size-tracking variable.
         Search for the best size for the scratchpad if one is applicable.
         (noce_process_if_block): Add scratchpad-based support for if-converting
         half hammocks with writes to destinations GCC says may "trap or fault".
         (unsafe_address_p): New.
         * invoke.texi (force-enable-rtl-ifcvt-spads): New.
         * params.def (force-enable-rtl-ifcvt-spads): New DEFPARAM.
         * targhooks.c (default_rtl_ifcvt_get_spad): New.
         * targhooks.h (default_rtl_ifcvt_get_spad): New.
         * target.def (rtl_ifcvt_scratchpad_control): New DEFHOOKPOD.
         (rtl_ifcvt_get_spad): New DEFHOOK.
         * target.h (rtl_ifcvt_spads_ctl_enum): New enum.



[patch posted as a plain-text attachment to avoid accidental formatting issues]



Regards,

Abe


[-- Attachment #2: Abe_RTL-ifcvt_patch_1___2015-11-5_004_.patch --]
[-- Type: text/plain, Size: 14378 bytes --]

From e70d66a5f2be1f2863bb56d21707b0be18010ad2 Mon Sep 17 00:00:00 2001
From: Abe <abe_skolnik@yahoo.com>
Date: Tue, 8 Sep 2015 17:00:38 -0500
Subject: [PATCH] Added support for RTL-level if-conversion of half hammocks
 with nonlocal writes.

---
 gcc/doc/invoke.texi |   4 ++
 gcc/ifcvt.c         | 198 +++++++++++++++++++++++++++++++++++++++++++++-------
 gcc/params.def      |   6 ++
 gcc/target.def      |  11 +++
 gcc/target.h        |   6 ++
 gcc/targhooks.c     |   4 ++
 gcc/targhooks.h     |   1 +
 7 files changed, 203 insertions(+), 27 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ebfaaa1..201ad16 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -11119,6 +11119,10 @@ automaton.  The default is 50.
 Chunk size of omp schedule for loops parallelized by parloops.  The default
 is 0.
 
+@item force-enable-rtl-ifcvt-spads
+Force-enable the use of scratchpads in RTL if conversion,
+overriding the target and the profile data or lack thereof.
+
 @end table
 @end table
 
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 7ab738e..6a25831 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -55,6 +55,9 @@
 #include "shrink-wrap.h"
 #include "rtl-iter.h"
 #include "ifcvt.h"
+#include "params.h"
+
+#include <utility>
 
 #ifndef MAX_CONDITIONAL_EXECUTE
 #define MAX_CONDITIONAL_EXECUTE \
@@ -100,9 +103,9 @@ static rtx noce_get_condition (rtx_insn *, rtx_insn **, bool);
 static int noce_operand_ok (const_rtx);
 static void merge_if_block (ce_if_block *);
 static int find_cond_trap (basic_block, edge, edge);
-static basic_block find_if_header (basic_block, int);
+static basic_block find_if_header (basic_block, int, bool);
 static int block_jumps_and_fallthru_p (basic_block, basic_block);
-static int noce_find_if_block (basic_block, edge, edge, int);
+static int noce_find_if_block (basic_block, edge, edge, int, bool);
 static int cond_exec_find_if_block (ce_if_block *);
 static int find_if_case_1 (basic_block, edge, edge);
 static int find_if_case_2 (basic_block, edge, edge);
@@ -110,6 +113,11 @@ static int dead_or_predicable (basic_block, basic_block, basic_block,
 			       edge, int);
 static void noce_emit_move_insn (rtx, rtx);
 static rtx_insn *block_has_only_trap (basic_block);
+
+/* An arbitrary inclusive maximum size (in bytes) for each scratchpad.  */
+#define SCRATCHPAD_MAX_SIZE 128
+static unsigned short spad_sz;
+static rtx spad;
 \f
 /* Count the number of non-jump active insns in BB.  */
 
@@ -2828,19 +2836,13 @@ noce_operand_ok (const_rtx op)
   return ! may_trap_p (op);
 }
 
-/* Return true if a write into MEM may trap or fault.  */
-
 static bool
 noce_mem_write_may_trap_or_fault_p (const_rtx mem)
 {
-  rtx addr;
-
   if (MEM_READONLY_P (mem))
     return true;
 
-  if (may_trap_or_fault_p (mem))
-    return true;
-
+  rtx addr;
   addr = XEXP (mem, 0);
 
   /* Call target hook to avoid the effects of -fpic etc....  */
@@ -2881,6 +2883,18 @@ noce_mem_write_may_trap_or_fault_p (const_rtx mem)
   return false;
 }
 
+/* Return true if a write into MEM may trap or fault
+   without scratchpad support.  */
+
+static bool
+unsafe_address_p (const_rtx mem)
+{
+  if (may_trap_or_fault_p (mem))
+    return true;
+
+  return noce_mem_write_may_trap_or_fault_p (mem);
+}
+
 /* Return whether we can use store speculation for MEM.  TOP_BB is the
    basic block above the conditional block where we are considering
    doing the speculative store.  We look for whether MEM is set
@@ -3031,8 +3045,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
    at converting the block.  */
 
 static int
-noce_process_if_block (struct noce_if_info *if_info)
+noce_process_if_block (struct noce_if_info *if_info, edge then_edge,
+		       bool just_sz_spad)
 {
+
   basic_block test_bb = if_info->test_bb;	/* test block */
   basic_block then_bb = if_info->then_bb;	/* THEN */
   basic_block else_bb = if_info->else_bb;	/* ELSE or NULL */
@@ -3047,7 +3063,7 @@ noce_process_if_block (struct noce_if_info *if_info)
 
      (1) if (...) x = a; else x = b;
      (2) x = b; if (...) x = a;
-     (3) if (...) x = a;   // as if with an initial x = x.
+     (3) if (...) x = a;
 
      The later patterns require jumps to be more expensive.
      For the if (...) x = a; else x = b; case we allow multiple insns
@@ -3200,17 +3216,119 @@ noce_process_if_block (struct noce_if_info *if_info)
 
   if (!set_b && MEM_P (orig_x))
     {
-      /* Disallow the "if (...) x = a;" form (implicit "else x = x;")
-	 for optimizations if writing to x may trap or fault,
-	 i.e. it's a memory other than a static var or a stack slot,
-	 is misaligned on strict aligned machines or is read-only.  If
-	 x is a read-only memory, then the program is valid only if we
-	 avoid the store into it.  If there are stores on both the
-	 THEN and ELSE arms, then we can go ahead with the conversion;
-	 either the program is broken, or the condition is always
-	 false such that the other memory is selected.  */
-      if (noce_mem_write_may_trap_or_fault_p (orig_x))
-	return FALSE;
+      /* Disallow the "if (...) x = a;" form (with no "else") for optimizations
+	 when x is misaligned on strict-alignment machines or is read-only.
+	 If x is a memory other than a static var or a stack slot: for targets
+	 _with_ conditional move and _without_ conditional execution,
+	 convert using the scratchpad technique, otherwise do not convert.
+	 If x is a read-only memory, then the program is valid only if we avoid
+	 the store into it.  If there are stores on both the THEN and ELSE arms,
+	 then we can go ahead with the conversion; either the program is broken,
+	 or the condition is always false such that the other memory is selected.
+	 The non-scratchpad-based conversion here has an implicit "else x = x;".  */
+      if (unsafe_address_p (orig_x))
+	{
+	  if (reload_completed
+	      || optimize < 2
+	      || optimize_function_for_size_p (cfun)
+	      || targetm.have_conditional_execution ()
+	      || !HAVE_conditional_move
+	      || (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1)))
+	      || (! PARAM_VALUE (PARAM_FORCE_ENABLE_RTL_IFCVT_SPADS)
+	          && rtl_ifcvt_spads_never
+	               == targetm.rtl_ifcvt_scratchpad_control))
+	    return FALSE;
+
+
+	  if ((! PARAM_VALUE (PARAM_FORCE_ENABLE_RTL_IFCVT_SPADS))
+	      && rtl_ifcvt_spads_as_per_profile
+		   == targetm.rtl_ifcvt_scratchpad_control
+	      && (PROFILE_ABSENT == profile_status_for_fn (cfun)
+		  || PROFILE_GUESSED == profile_status_for_fn (cfun)
+		  || predictable_edge_p (then_edge)
+		  || ! maybe_hot_bb_p (cfun, then_bb)))
+	    return FALSE;
+
+	  if (noce_mem_write_may_trap_or_fault_p (orig_x)
+	      || !MEM_SIZE_KNOWN_P (orig_x))
+	    return FALSE;
+
+	  const size_t mem_sz = MEM_SIZE (orig_x);
+	  if (mem_sz > SCRATCHPAD_MAX_SIZE)
+	    return FALSE;
+
+	  if (mem_sz > spad_sz)
+	      spad_sz = mem_sz;
+
+	  /* Not FALSE because it is not a failure to convert.  */
+	  if (just_sz_spad)
+	    return 0;
+
+	  gcc_assert (spad);
+
+	  rtx reg_for_store_addr = gen_reg_rtx (Pmode);
+	  set_used_flags (reg_for_store_addr);
+
+	  start_sequence ();
+
+	  for (rtx_insn *insn = BB_HEAD (then_bb);
+	       insn && insn != insn_a && insn != BB_END (then_bb);
+	       insn = NEXT_INSN (insn))
+	    if (!(NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+	      duplicate_insn_chain (insn, insn);
+
+	  rtx target = noce_emit_cmove (if_info,
+					reg_for_store_addr,
+					GET_CODE (cond),
+					XEXP (cond, 0),
+					XEXP (cond, 1),
+					XEXP (orig_x, 0),
+					XEXP (spad, 0));
+
+	  if (!target)
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+	  if (target != reg_for_store_addr)
+	    noce_emit_move_insn (reg_for_store_addr, target);
+
+	  rtx mem = gen_rtx_MEM (GET_MODE (orig_x), reg_for_store_addr);
+	  MEM_NOTRAP_P (mem) = MEM_NOTRAP_P (orig_x);
+	  MEM_VOLATILE_P (mem) = MEM_VOLATILE_P (orig_x);
+
+	  alias_set_type temp_alias_set = new_alias_set ();
+	  if (MEM_ALIAS_SET (orig_x))
+	    record_alias_subset (MEM_ALIAS_SET (orig_x), temp_alias_set);
+	  set_mem_alias_set (mem, temp_alias_set);
+
+	  set_mem_align (mem, MIN (MEM_ALIGN (spad),
+				   MEM_ALIGN (orig_x)));
+	  if (MEM_ADDR_SPACE (orig_x) != MEM_ADDR_SPACE (spad))
+	    {
+	      end_sequence ();
+	      return FALSE;
+	    }
+
+	  set_used_flags (mem);
+
+	  noce_emit_move_insn (mem, a);
+
+	  rtx_insn *seq = end_ifcvt_sequence (if_info);
+	  if (!seq)
+	    return FALSE;
+
+	  unshare_all_rtl_in_chain (seq);
+
+	  /* Prevent the code right after "success:"
+	     from throwing away the changes.  */
+	  x = orig_x;
+
+	  emit_insn_before_setloc (seq, if_info->jump,
+				   INSN_LOCATION (if_info->insn_a));
+	  goto success;
+
+	}
 
       /* Avoid store speculation: given "if (...) x = a" where x is a
 	 MEM, we only want to do the store if x is always set
@@ -3224,6 +3342,10 @@ noce_process_if_block (struct noce_if_info *if_info)
 	return FALSE;
     }
 
+  /* Not FALSE because it is not a failure to convert.  */
+  if (just_sz_spad)
+    return 0;
+
   if (noce_try_move (if_info))
     goto success;
   if (noce_try_store_flag (if_info))
@@ -3592,7 +3714,7 @@ done:
 
 static int
 noce_find_if_block (basic_block test_bb, edge then_edge, edge else_edge,
-		    int pass)
+		    int pass, bool just_sz_spad)
 {
   basic_block then_bb, else_bb, join_bb;
   bool then_else_reversed = false;
@@ -3696,9 +3818,13 @@ noce_find_if_block (basic_block test_bb, edge then_edge, edge else_edge,
 
   /* Do the real work.  */
 
-  if (noce_process_if_block (&if_info))
+  if (noce_process_if_block (&if_info, then_edge, just_sz_spad))
     return TRUE;
 
+  /* Not FALSE because it is not a failure to convert.  */
+  if (just_sz_spad)
+    return 0;
+
   if (HAVE_conditional_move
       && cond_move_process_if_block (&if_info))
     return TRUE;
@@ -3853,7 +3979,7 @@ merge_if_block (struct ce_if_block * ce_info)
    first block if some transformation was done.  Return NULL otherwise.  */
 
 static basic_block
-find_if_header (basic_block test_bb, int pass)
+find_if_header (basic_block test_bb, int pass, bool just_sz_spad)
 {
   ce_if_block ce_info;
   edge then_edge;
@@ -3901,9 +4027,11 @@ find_if_header (basic_block test_bb, int pass)
 #endif
 
   if (!reload_completed
-      && noce_find_if_block (test_bb, then_edge, else_edge, pass))
+      && noce_find_if_block (test_bb, then_edge, else_edge, pass, just_sz_spad))
     goto success;
 
+  if (just_sz_spad)  return 0; /* Not FALSE because it is not a failure to convert.  */
+
   if (reload_completed
       && targetm.have_conditional_execution ()
       && cond_exec_find_if_block (&ce_info))
@@ -5003,6 +5131,8 @@ if_convert (bool after_combine)
   basic_block bb;
   int pass;
 
+  spad_sz = 0;
+
   if (optimize == 1)
     {
       df_live_add_problem ();
@@ -5027,6 +5157,20 @@ if_convert (bool after_combine)
 
   df_set_flags (DF_LR_RUN_DCE);
 
+  if (HAVE_conditional_move
+      && (! targetm.have_conditional_execution ())
+      && (PARAM_VALUE (PARAM_FORCE_ENABLE_RTL_IFCVT_SPADS)
+          || rtl_ifcvt_spads_never
+	       != targetm.rtl_ifcvt_scratchpad_control))
+    {
+      FOR_EACH_BB_FN (bb, cfun)
+	/* We do not care about the return value in this case.  */
+	find_if_header (bb, 0, true);
+
+      if (spad_sz)
+	spad = targetm.rtl_ifcvt_get_spad (spad_sz);
+    }
+
   /* Go through each of the basic blocks looking for things to convert.  If we
      have conditional execution, we make multiple passes to allow us to handle
      IF-THEN{-ELSE} blocks within other IF-THEN{-ELSE} blocks.  */
@@ -5048,7 +5192,7 @@ if_convert (bool after_combine)
 	{
           basic_block new_bb;
           while (!df_get_bb_dirty (bb)
-                 && (new_bb = find_if_header (bb, pass)) != NULL)
+                 && (new_bb = find_if_header (bb, pass, false)) != NULL)
             bb = new_bb;
 	}
 
diff --git a/gcc/params.def b/gcc/params.def
index 3f91992..05f0e5e 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -254,6 +254,12 @@ DEFPARAM(PARAM_GCSE_UNRESTRICTED_COST,
 	 "Cost at which GCSE optimizations will not constraint the distance an expression can travel",
 	 3, 0, 0)
 
+DEFPARAM (PARAM_FORCE_ENABLE_RTL_IFCVT_SPADS,
+	  "force-enable-rtl-ifcvt-spads",
+	  "Force-enable the use of scratchpads in RTL if conversion, "
+	  "overriding the target and the profile data or lack thereof.",
+	  0, 0, 1)
+
 /* How deep from a given basic block the dominator tree should be searched
    for expressions to hoist to the block.  The value of 0 will avoid limiting
    the search.  */
diff --git a/gcc/target.def b/gcc/target.def
index d29aad5..0cb5418 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -5915,6 +5915,17 @@ HOOK_VECTOR_END (mode_switching)
 #include "target-insns.def"
 #undef DEF_TARGET_INSN
 
+DEFHOOKPOD
+(rtl_ifcvt_scratchpad_control,
+"*",
+enum rtl_ifcvt_spads_ctl_enum, rtl_ifcvt_spads_as_per_profile)
+
+DEFHOOK
+(rtl_ifcvt_get_spad,
+ "*",
+ rtx, (unsigned short size),
+ default_rtl_ifcvt_get_spad)
+
 /* Close the 'struct gcc_target' definition.  */
 HOOK_VECTOR_END (C90_EMPTY_HACK)
 
diff --git a/gcc/target.h b/gcc/target.h
index a79f424..a65dfbd 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -187,6 +187,12 @@ enum vect_cost_model_location {
 #define DEFHOOK_UNDOC DEFHOOK
 #define HOOKSTRUCT(FRAGMENT) FRAGMENT
 
+enum rtl_ifcvt_spads_ctl_enum {
+  rtl_ifcvt_spads_never = 0,
+  rtl_ifcvt_spads_always,
+  rtl_ifcvt_spads_as_per_profile
+};
+
 #include "target.def"
 
 extern struct gcc_target targetm;
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 7238c8f..0a55485 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1922,4 +1922,8 @@ can_use_doloop_if_innermost (const widest_int &, const widest_int &,
   return loop_depth == 1;
 }
 
+rtx default_rtl_ifcvt_get_spad (unsigned short size)
+{
+  return assign_stack_local (BLKmode, size, 0);
+}
 #include "gt-targhooks.h"
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 77c284a..b966c6d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -243,4 +243,5 @@ extern void default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE
 						  tree type ATTRIBUTE_UNUSED,
 						  int *pretend_arg_size ATTRIBUTE_UNUSED,
 						  int second_time ATTRIBUTE_UNUSED);
+extern rtx default_rtl_ifcvt_get_spad (unsigned short size);
 #endif /* GCC_TARGHOOKS_H */
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-05 23:43 improved RTL-level if conversion using scratchpads [half-hammock edition] Abe
@ 2015-11-06 10:56 ` Bernd Schmidt
  2015-11-06 14:11   ` Sebastian Pop
       [not found]   ` <563CDCAF.20703@yahoo.com>
  0 siblings, 2 replies; 11+ messages in thread
From: Bernd Schmidt @ 2015-11-06 10:56 UTC (permalink / raw)
  To: Abe, gcc-patches, Sebastian Pop, Kyrill Tkachov

On 11/06/2015 12:43 AM, Abe wrote:
> Feedback from Bernd has also been applied.

But inconsistently, and I think not quite in the way I meant it in one case.

> -/* Return true if a write into MEM may trap or fault.  */
> -
>   static bool
>   noce_mem_write_may_trap_or_fault_p (const_rtx mem)
>   {
> -  rtx addr;
> -
>     if (MEM_READONLY_P (mem))
>       return true;
>
> -  if (may_trap_or_fault_p (mem))
> -    return true;
> -
> +  rtx addr;
>     addr = XEXP (mem, 0);
>
>     /* Call target hook to avoid the effects of -fpic etc....  */
> @@ -2881,6 +2883,18 @@ noce_mem_write_may_trap_or_fault_p (const_rtx mem)
>     return false;
>   }
>
> +/* Return true if a write into MEM may trap or fault
> +   without scratchpad support.  */
> +
> +static bool
> +unsafe_address_p (const_rtx mem)
> +{
> +  if (may_trap_or_fault_p (mem))
> +    return true;
> +
> +  return noce_mem_write_may_trap_or_fault_p (mem);
> +}

The naming seems backwards from what I suggested. You 
still haven't explained why you want to modify this function, or call 
the limited one even when generating scratchpads. I asked about this 
last time.

>   static basic_block
> -find_if_header (basic_block test_bb, int pass)
> +find_if_header (basic_block test_bb, int pass, bool just_sz_spad)
>   {

Arguments need documentation.

> +DEFPARAM (PARAM_FORCE_ENABLE_RTL_IFCVT_SPADS,
> +	  "force-enable-rtl-ifcvt-spads",
> +	  "Force-enable the use of scratchpads in RTL if conversion, "
> +	  "overriding the target and the profile data or lack thereof.",
> +	  0, 0, 1)
> +
>
> +DEFHOOKPOD
> +(rtl_ifcvt_scratchpad_control,
> +"*",
> +enum rtl_ifcvt_spads_ctl_enum, rtl_ifcvt_spads_as_per_profile)
> +
> +DEFHOOK
> +(rtl_ifcvt_get_spad,
> + "*",
> + rtx, (unsigned short size),
> + default_rtl_ifcvt_get_spad)

That moves the problematic bit in a target hook rather than fixing it. 
Two target hooks and a param is probably a bit much for a change like 
this. Which target do you actually want this for? It would help to 
understand why you're doing all this.

> +enum rtl_ifcvt_spads_ctl_enum {

Names are still too verbose ("enum" shouldn't be part of it).

> +rtx default_rtl_ifcvt_get_spad (unsigned short size)
> +{
> +  return assign_stack_local (BLKmode, size, 0);
> +}

Formatting problem, here and in a few other places. I didn't fully read 
the patch this time around.

I'm probably not reviewing further patches because I don't see this 
progressing to a state where it's acceptable. Others may do so, but as 
far as I'm concerned the patch is rejected.


Bernd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 10:56 ` Bernd Schmidt
@ 2015-11-06 14:11   ` Sebastian Pop
  2015-11-06 14:32     ` Bernd Schmidt
  2015-11-06 18:43     ` Jeff Law
       [not found]   ` <563CDCAF.20703@yahoo.com>
  1 sibling, 2 replies; 11+ messages in thread
From: Sebastian Pop @ 2015-11-06 14:11 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Abe, gcc-patches, Kyrill Tkachov

On Fri, Nov 6, 2015 at 2:56 AM, Bernd Schmidt <bschmidt@redhat.com> wrote:
> Formatting problem, here and in a few other places. I didn't fully read the
> patch this time around.
>
> I'm probably not reviewing further patches because I don't see this
> progressing to a state where it's acceptable. Others may do so, but as far
> as I'm concerned the patch is rejected.

Bernd,
I would like to ask you to focus on the technical part, and provide a
review only based on technical reasons.
Please ignore all formatting changes: I will help address all those changes.
I will send a patch addressing all the comments you had in the current review.

Thanks,
Sebastian

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 14:11   ` Sebastian Pop
@ 2015-11-06 14:32     ` Bernd Schmidt
  2015-11-06 15:52       ` Sebastian Pop
  2015-11-06 18:43     ` Jeff Law
  1 sibling, 1 reply; 11+ messages in thread
From: Bernd Schmidt @ 2015-11-06 14:32 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Abe, gcc-patches, Kyrill Tkachov

On 11/06/2015 03:10 PM, Sebastian Pop wrote:
> On Fri, Nov 6, 2015 at 2:56 AM, Bernd Schmidt <bschmidt@redhat.com> wrote:
>> Formatting problem, here and in a few other places. I didn't fully read the
>> patch this time around.
>>
>> I'm probably not reviewing further patches because I don't see this
>> progressing to a state where it's acceptable. Others may do so, but as far
>> as I'm concerned the patch is rejected.
>
> Bernd,
> I would like to ask you to focus on the technical part, and provide a
> review only based on technical reasons.
> Please ignore all formatting changes: I will help address all those changes.
> I will send a patch addressing all the comments you had in the current review.

As long as this just has allocation from the normal stack frame as its 
only strategy, I consider it unacceptable (and I think Richard B voiced 
the same opinion). If you want a half-finished redzone allocator, I can 
send you a patch.


Bernd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 14:32     ` Bernd Schmidt
@ 2015-11-06 15:52       ` Sebastian Pop
  2015-11-06 16:28         ` Bernd Schmidt
  0 siblings, 1 reply; 11+ messages in thread
From: Sebastian Pop @ 2015-11-06 15:52 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Abe, gcc-patches, Kyrill Tkachov

On Fri, Nov 6, 2015 at 6:32 AM, Bernd Schmidt <bschmidt@redhat.com> wrote:
> On 11/06/2015 03:10 PM, Sebastian Pop wrote:
>>
>> On Fri, Nov 6, 2015 at 2:56 AM, Bernd Schmidt <bschmidt@redhat.com> wrote:
>>>
>>> Formatting problem, here and in a few other places. I didn't fully read
>>> the
>>> patch this time around.
>>>
>>> I'm probably not reviewing further patches because I don't see this
>>> progressing to a state where it's acceptable. Others may do so, but as
>>> far
>>> as I'm concerned the patch is rejected.
>>
>>
>> Bernd,
>> I would like to ask you to focus on the technical part, and provide a
>> review only based on technical reasons.
>> Please ignore all formatting changes: I will help address all those
>> changes.
>> I will send a patch addressing all the comments you had in the current
>> review.
>
>
> As long as this just has allocation from the normal stack frame as its only
> strategy, I consider it unacceptable (and I think Richard B voiced the same

Understood.

> opinion). If you want a half-finished redzone allocator, I can send you a
> patch.

Yes please.  Let's get it to work.

Thanks,
Sebastian

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 15:52       ` Sebastian Pop
@ 2015-11-06 16:28         ` Bernd Schmidt
  2015-11-07 12:02           ` Bernhard Reutner-Fischer
  0 siblings, 1 reply; 11+ messages in thread
From: Bernd Schmidt @ 2015-11-06 16:28 UTC (permalink / raw)
  To: Sebastian Pop, Bernd Schmidt; +Cc: Abe, gcc-patches, Kyrill Tkachov

[-- Attachment #1: Type: text/plain, Size: 1125 bytes --]

On 11/06/2015 04:52 PM, Sebastian Pop wrote:

>> opinion). If you want a half-finished redzone allocator, I can send you a
>> patch.
>
> Yes please.  Let's get it to work.

Here you go. This is incomplete and does not compile, but it shows the 
direction I have in mind and isn't too far off. I had a similar patch 
once for a machine that had two stack pointers (don't ask), and I 
started to recreate that for the given problem last week.

The temp slot handling code in function.c needs more frame arguments, 
but I got halfway through them and started wondering whether they should 
be member functions of struct frame_info instead.

The bits in cfgexpand and function, once complete, are essentially all 
that's necessary to support a second frame, but for this to work as a 
redzone allocator it needs to be integrated with target (i.e. i386) 
frame layout code. For purposes of optimizing we may also want to 
establish a maximum frame size for the rz_frame.

Bonus points if reload/lra use this for spilled pseudos that don't live 
across calls, but I can have a go at that if you don't feel like 
tackling it.


Bernd


[-- Attachment #2: rzalloc.diff --]
[-- Type: text/x-patch, Size: 21435 bytes --]

diff --git a/gcc/caller-save.c b/gcc/caller-save.c
index 084d079..c3a5256 100644
--- a/gcc/caller-save.c
+++ b/gcc/caller-save.c
@@ -654,7 +654,7 @@ setup_save_areas (void)
 		{
 		  saved_reg->slot
 		    = assign_stack_local_1
-		      (regno_save_mode[regno][1],
+		      (&crtl->frame, regno_save_mode[regno][1],
 		       GET_MODE_SIZE (regno_save_mode[regno][1]), 0,
 		       ASLK_REDUCE_ALIGN);
 		  if (dump_file != NULL)
@@ -712,7 +712,8 @@ setup_save_areas (void)
 	       when we restore and save the hard register in
 	       insert_restore and insert_save.  */
 	    regno_save_mem[i][j]
-	      = assign_stack_local_1 (regno_save_mode[i][j],
+	      = assign_stack_local_1 (&crtl->frame,
+				      regno_save_mode[i][j],
 				      GET_MODE_SIZE (regno_save_mode[i][j]),
 				      0, ASLK_REDUCE_ALIGN);
 
diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index bfbc958..8825217 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -335,11 +335,6 @@ static bitmap_obstack stack_var_bitmap_obstack;
    is non-decreasing.  */
 static size_t *stack_vars_sorted;
 
-/* The phase of the stack frame.  This is the known misalignment of
-   virtual_stack_vars_rtx from PREFERRED_STACK_BOUNDARY.  That is,
-   (frame_offset+frame_phase) % PREFERRED_STACK_BOUNDARY == 0.  */
-static int frame_phase;
-
 /* Used during expand_used_vars to remember if we saw any decls for
    which we'd like to enable stack smashing protection.  */
 static bool has_protected_decls;
@@ -375,32 +370,34 @@ align_base (HOST_WIDE_INT base, unsigned HOST_WIDE_INT align, bool align_up)
   return align_up ? (base + align - 1) & -align : base & -align;
 }
 
-/* Allocate SIZE bytes at byte alignment ALIGN from the stack frame.
+/* Allocate SIZE bytes at byte alignment ALIGN from the stack frame FRAME.
    Return the frame offset.  */
 
 static HOST_WIDE_INT
-alloc_stack_frame_space (HOST_WIDE_INT size, unsigned HOST_WIDE_INT align)
+alloc_stack_frame_space (frame_info *frame, HOST_WIDE_INT size,
+			 unsigned HOST_WIDE_INT align)
 {
   HOST_WIDE_INT offset, new_frame_offset;
 
-  if (FRAME_GROWS_DOWNWARD)
+  if (frame->grows_downward)
     {
-      new_frame_offset
-	= align_base (frame_offset - frame_phase - size,
-		      align, false) + frame_phase;
+      new_frame_offset = align_base (frame->frame_offset - frame->phase - size,
+				     align, false);
+      new_frame_offset += frame->phase;
       offset = new_frame_offset;
     }
   else
     {
-      new_frame_offset
-	= align_base (frame_offset - frame_phase, align, true) + frame_phase;
+      new_frame_offset = align_base (frame->frame_offset - frame->phase,
+				     align, true);
+      new_frame_offset += frame->phase;
       offset = new_frame_offset;
       new_frame_offset += size;
     }
-  frame_offset = new_frame_offset;
+  frame->frame_offset = new_frame_offset;
 
-  if (frame_offset_overflow (frame_offset, cfun->decl))
-    frame_offset = offset = 0;
+  if (frame_offset_overflow (frame->frame_offset, cfun->decl))
+    frame->frame_offset = offset = 0;
 
   return offset;
 }
@@ -965,11 +962,11 @@ dump_stack_var_partition (void)
     }
 }
 
-/* Assign rtl to DECL at BASE + OFFSET.  */
+/* Assign rtl to DECL at BASE + OFFSET in frame FRAME.  */
 
 static void
-expand_one_stack_var_at (tree decl, rtx base, unsigned base_align,
-			 HOST_WIDE_INT offset)
+expand_one_stack_var_at (frame_info *frame, tree decl, rtx base,
+			 unsigned base_align, HOST_WIDE_INT offset)
 {
   unsigned align;
   rtx x;
@@ -988,7 +985,7 @@ expand_one_stack_var_at (tree decl, rtx base, unsigned base_align,
          If it is we generate stack slots only accidentally so it isn't as
 	 important, we'll simply use the alignment that is already set.  */
       if (base == virtual_stack_vars_rtx)
-	offset -= frame_phase;
+	offset -= frame->phase;
       align = offset & -offset;
       align *= BITS_PER_UNIT;
       if (align == 0 || align > base_align)
@@ -1120,7 +1117,7 @@ expand_stack_vars (bool (*pred) (size_t), struct stack_vars_data *data)
 			      FRAME_GROWS_DOWNWARD);
 	      tree repr_decl = NULL_TREE;
 	      offset
-		= alloc_stack_frame_space (stack_vars[i].size
+		= alloc_stack_frame_space (&crtl->frame, stack_vars[i].size
 					   + ASAN_RED_ZONE_SIZE,
 					   MAX (alignb, ASAN_RED_ZONE_SIZE));
 
@@ -1157,7 +1154,8 @@ expand_stack_vars (bool (*pred) (size_t), struct stack_vars_data *data)
 	    }
 	  else
 	    {
-	      offset = alloc_stack_frame_space (stack_vars[i].size, alignb);
+	      offset = alloc_stack_frame_space (&crtl->frame,
+						stack_vars[i].size, alignb);
 	      base_align = crtl->max_used_stack_slot_alignment;
 	    }
 	}
@@ -1181,9 +1179,8 @@ expand_stack_vars (bool (*pred) (size_t), struct stack_vars_data *data)
 	 partition.  */
       for (j = i; j != EOC; j = stack_vars[j].next)
 	{
-	  expand_one_stack_var_at (stack_vars[j].decl,
-				   base, base_align,
-				   offset);
+	  expand_one_stack_var_at (&crtl->frame, stack_vars[j].decl,
+				   base, base_align, offset);
 	}
     }
 
@@ -1276,9 +1273,9 @@ expand_one_stack_var_1 (tree var)
   /* We handle highly aligned variables in expand_stack_vars.  */
   gcc_assert (byte_align * BITS_PER_UNIT <= MAX_SUPPORTED_STACK_ALIGNMENT);
 
-  offset = alloc_stack_frame_space (size, byte_align);
+  offset = alloc_stack_frame_space (&crtl->frame, size, byte_align);
 
-  expand_one_stack_var_at (var, virtual_stack_vars_rtx,
+  expand_one_stack_var_at (&crtl->frame, var, virtual_stack_vars_rtx,
 			   crtl->max_used_stack_slot_alignment, offset);
 }
 
@@ -1995,13 +1992,6 @@ expand_used_vars (void)
   unsigned len;
   bool gen_stack_protect_signal = false;
 
-  /* Compute the phase of the stack frame for this function.  */
-  {
-    int align = PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT;
-    int off = STARTING_FRAME_OFFSET % align;
-    frame_phase = off ? align - off : 0;
-  }
-
   /* Set TREE_USED on all variables in the local_decls.  */
   FOR_EACH_LOCAL_DECL (cfun, i, var)
     TREE_USED (var) = 1;
@@ -2197,12 +2187,13 @@ expand_used_vars (void)
 	    redzonesz = ((sz + ASAN_RED_ZONE_SIZE + data.asan_alignb - 1)
 			 & ~(data.asan_alignb - HOST_WIDE_INT_1)) - sz;
 	  offset
-	    = alloc_stack_frame_space (redzonesz, ASAN_RED_ZONE_SIZE);
+	    = alloc_stack_frame_space (&crtl->frame, redzonesz, ASAN_RED_ZONE_SIZE);
 	  data.asan_vec.safe_push (prev_offset);
 	  data.asan_vec.safe_push (offset);
 	  /* Leave space for alignment if STRICT_ALIGNMENT.  */
 	  if (STRICT_ALIGNMENT)
-	    alloc_stack_frame_space ((GET_MODE_ALIGNMENT (SImode)
+	    alloc_stack_frame_space (&crtl->frame,
+				     (GET_MODE_ALIGNMENT (SImode)
 				      << ASAN_SHADOW_SHIFT)
 				     / BITS_PER_UNIT, 1);
 
diff --git a/gcc/emit-rtl.h b/gcc/emit-rtl.h
index f52c335..8f90fd0 100644
--- a/gcc/emit-rtl.h
+++ b/gcc/emit-rtl.h
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 struct temp_slot;
 typedef struct temp_slot *temp_slot_p;
 
-/* Information mainlined about RTL representation of incoming arguments.  */
+/* Information maintained about RTL representation of incoming arguments.  */
 struct GTY(()) incoming_args {
   /* Number of bytes of args popped by function being compiled on its return.
      Zero if no bytes are to be popped.
@@ -52,6 +52,33 @@ struct GTY(()) incoming_args {
   rtx internal_arg_pointer;
 };
 
+/* Information maintained about layout and size of a stack frame.  */
+struct GTY(()) frame_info {
+  /* Offset to end of allocated area of stack frame.
+     If stack grows down, this is the address of the last stack slot allocated.
+     If stack grows up, this is the address for the next slot.  */
+  HOST_WIDE_INT frame_offset;
+
+  /* List of all used temporaries allocated, by level.  */
+  VEC(temp_slot_p,gc) *used_temp_slots;
+
+  /* List of available temp slots.  */
+  struct temp_slot *avail_temp_slots;
+
+  /* The phase of the frame offset.  */
+  int phase;
+
+  /* List of empty areas in the stack frame.  */
+  struct frame_space *frame_space_list;
+
+  /* The base register to be used for this frame.  */
+  rtx base;
+
+  /* True if FRAME_GROWS_DOWNWARD (or any similar definition)
+     applies to this particular frame.  */
+  bool grows_downward;
+};
+
 
 /* Datastructures maintained for currently processed function in RTL form.  */
 struct GTY(()) rtl_data {
@@ -106,12 +133,15 @@ struct GTY(()) rtl_data {
      Made for the sake of unshare_all_rtl.  */
   rtx_expr_list *x_stack_slot_list;
 
-  /* List of empty areas in the stack frame.  */
-  struct frame_space *frame_space_list;
-
   /* Place after which to insert the tail_recursion_label if we need one.  */
   rtx_note *x_stack_check_probe_note;
 
+  /* Description of the regular stack frame.  */
+  frame_info *frame;
+
+  /* Description of the redzone stack frame.  */
+  frame_info *rz_frame;
+
   /* Location at which to save the argument pointer if it will need to be
      referenced.  There are two cases where this is done: if nonlocal gotos
      exist, or if vars stored at an offset from the argument pointer will be
@@ -121,23 +151,9 @@ struct GTY(()) rtl_data {
   /* Dynamic Realign Argument Pointer used for realigning stack.  */
   rtx drap_reg;
 
-  /* Offset to end of allocated area of stack frame.
-     If stack grows down, this is the address of the last stack slot allocated.
-     If stack grows up, this is the address for the next slot.  */
-  HOST_WIDE_INT x_frame_offset;
-
   /* Insn after which register parms and SAVE_EXPRs are born, if nonopt.  */
   rtx_insn *x_parm_birth_insn;
 
-  /* List of all used temporaries allocated, by level.  */
-  vec<temp_slot_p, va_gc> *x_used_temp_slots;
-
-  /* List of available temp slots.  */
-  struct temp_slot *x_avail_temp_slots;
-
-  /* Current nesting level for temporaries.  */
-  int x_temp_slot_level;
-
   /* The largest alignment needed on the stack, including requirement
      for outgoing stack alignment.  */
   unsigned int stack_alignment_needed;
@@ -290,11 +306,8 @@ struct GTY(()) rtl_data {
 #define naked_return_label (crtl->x_naked_return_label)
 #define stack_slot_list (crtl->x_stack_slot_list)
 #define parm_birth_insn (crtl->x_parm_birth_insn)
-#define frame_offset (crtl->x_frame_offset)
 #define stack_check_probe_note (crtl->x_stack_check_probe_note)
 #define arg_pointer_save_area (crtl->x_arg_pointer_save_area)
-#define used_temp_slots (crtl->x_used_temp_slots)
-#define avail_temp_slots (crtl->x_avail_temp_slots)
 #define temp_slot_level (crtl->x_temp_slot_level)
 #define nonlocal_goto_handler_labels (crtl->x_nonlocal_goto_handler_labels)
 #define frame_pointer_needed (crtl->frame_pointer_needed)
diff --git a/gcc/function.c b/gcc/function.c
index a637cb3..9aec731 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -217,10 +217,10 @@ free_after_compilation (struct function *f)
 HOST_WIDE_INT
 get_frame_size (void)
 {
-  if (FRAME_GROWS_DOWNWARD)
-    return -frame_offset;
+  if (-crtl->frame.grows_downward)
+    return -crtl->frame.frame_offset;
   else
-    return frame_offset;
+    return crtl->frame.frame_offset;
 }
 
 /* Issue an error message and return TRUE if frame OFFSET overflows in
@@ -272,19 +272,14 @@ get_stack_local_alignment (tree type, machine_mode mode)
    given a start/length pair that lies at the end of the frame.  */
 
 static bool
-try_fit_stack_local (HOST_WIDE_INT start, HOST_WIDE_INT length,
+try_fit_stack_local (frame_info *frame,
+		     HOST_WIDE_INT start, HOST_WIDE_INT length,
 		     HOST_WIDE_INT size, unsigned int alignment,
 		     HOST_WIDE_INT *poffset)
 {
   HOST_WIDE_INT this_frame_offset;
   int frame_off, frame_alignment, frame_phase;
 
-  /* Calculate how many bytes the start of local variables is off from
-     stack alignment.  */
-  frame_alignment = PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT;
-  frame_off = STARTING_FRAME_OFFSET % frame_alignment;
-  frame_phase = frame_off ? frame_alignment - frame_off : 0;
-
   /* Round the frame offset to the specified alignment.  */
 
   /*  We must be careful here, since FRAME_OFFSET might be negative and
@@ -293,14 +288,14 @@ try_fit_stack_local (HOST_WIDE_INT start, HOST_WIDE_INT length,
       use logical operations which are unambiguous.  */
   if (FRAME_GROWS_DOWNWARD)
     this_frame_offset
-      = (FLOOR_ROUND (start + length - size - frame_phase,
+      = (FLOOR_ROUND (start + length - size - frame->phase,
 		      (unsigned HOST_WIDE_INT) alignment)
-	 + frame_phase);
+	 + frame->phase);
   else
     this_frame_offset
       = (CEIL_ROUND (start - frame_phase,
 		     (unsigned HOST_WIDE_INT) alignment)
-	 + frame_phase);
+	 + frame->phase);
 
   /* See if it fits.  If this space is at the edge of the frame,
      consider extending the frame to make it fit.  Our caller relies on
@@ -324,17 +319,17 @@ try_fit_stack_local (HOST_WIDE_INT start, HOST_WIDE_INT length,
    function's frame_space_list.  */
 
 static void
-add_frame_space (HOST_WIDE_INT start, HOST_WIDE_INT end)
+add_frame_space (frame_info *frame, HOST_WIDE_INT start, HOST_WIDE_INT end)
 {
   struct frame_space *space = ggc_alloc<frame_space> ();
-  space->next = crtl->frame_space_list;
-  crtl->frame_space_list = space;
+  space->next = frame->frame_space_list;
+  frame->frame_space_list = space;
   space->start = start;
   space->length = end - start;
 }
 
-/* Allocate a stack slot of SIZE bytes and return a MEM rtx for it
-   with machine mode MODE.
+/* Allocate a stack slot of SIZE bytes in FRAME and return a MEM rtx
+   for it with machine mode MODE.
 
    ALIGN controls the amount of alignment for the address of the slot:
    0 means according to MODE,
@@ -351,12 +346,13 @@ add_frame_space (HOST_WIDE_INT start, HOST_WIDE_INT end)
    We do not round to stack_boundary here.  */
 
 rtx
-assign_stack_local_1 (machine_mode mode, HOST_WIDE_INT size,
-		      int align, int kind)
+assign_stack_local_1 (frame_info *frame, machine_mode mode,
+		      HOST_WIDE_INT size, int align, int kind)
 {
   rtx x, addr;
   int bigend_correction = 0;
   HOST_WIDE_INT slot_offset = 0, old_frame_offset;
+  HOST_WIDE_INT frame_offset = frame->frame_offset;
   unsigned int alignment, alignment_in_bits;
 
   if (align == 0)
@@ -424,17 +420,17 @@ assign_stack_local_1 (machine_mode mode, HOST_WIDE_INT size,
 	{
 	  struct frame_space **psp;
 
-	  for (psp = &crtl->frame_space_list; *psp; psp = &(*psp)->next)
+	  for (psp = &frame->frame_space_list; *psp; psp = &(*psp)->next)
 	    {
 	      struct frame_space *space = *psp;
-	      if (!try_fit_stack_local (space->start, space->length, size,
-					alignment, &slot_offset))
+	      if (!try_fit_stack_local (frame, space->start, space->length,
+					size, alignment, &slot_offset))
 		continue;
 	      *psp = space->next;
 	      if (slot_offset > space->start)
-		add_frame_space (space->start, slot_offset);
+		add_frame_space (frame, space->start, slot_offset);
 	      if (slot_offset + size < space->start + space->length)
-		add_frame_space (slot_offset + size,
+		add_frame_space (frame, slot_offset + size,
 				 space->start + space->length);
 	      goto found_space;
 	    }
@@ -450,28 +446,30 @@ assign_stack_local_1 (machine_mode mode, HOST_WIDE_INT size,
 
   if (FRAME_GROWS_DOWNWARD)
     {
-      frame_offset -= size;
-      try_fit_stack_local (frame_offset, size, size, alignment, &slot_offset);
+      frame->x_frame_offset -= size;
+      try_fit_stack_local (frame, frame_offset, size, size, alignment,
+			   &slot_offset);
 
       if (kind & ASLK_RECORD_PAD)
 	{
 	  if (slot_offset > frame_offset)
-	    add_frame_space (frame_offset, slot_offset);
+	    add_frame_space (frame, frame_offset, slot_offset);
 	  if (slot_offset + size < old_frame_offset)
-	    add_frame_space (slot_offset + size, old_frame_offset);
+	    add_frame_space (frame, slot_offset + size, old_frame_offset);
 	}
     }
   else
     {
-      frame_offset += size;
-      try_fit_stack_local (old_frame_offset, size, size, alignment, &slot_offset);
+      frame->x_frame_offset += size;
+      try_fit_stack_local (frame, old_frame_offset, size, size, alignment,
+			   &slot_offset);
 
       if (kind & ASLK_RECORD_PAD)
 	{
 	  if (slot_offset > old_frame_offset)
-	    add_frame_space (old_frame_offset, slot_offset);
+	    add_frame_space (frame, old_frame_offset, slot_offset);
 	  if (slot_offset + size < frame_offset)
-	    add_frame_space (slot_offset + size, frame_offset);
+	    add_frame_space (frame, slot_offset + size, frame_offset);
 	}
     }
 
@@ -481,10 +479,11 @@ assign_stack_local_1 (machine_mode mode, HOST_WIDE_INT size,
   if (BYTES_BIG_ENDIAN && mode != BLKmode && GET_MODE_SIZE (mode) < size)
     bigend_correction = size - GET_MODE_SIZE (mode);
 
+  rtx reg = frame->base;
   /* If we have already instantiated virtual registers, return the actual
      address relative to the frame pointer.  */
-  if (virtuals_instantiated)
-    addr = plus_constant (Pmode, frame_pointer_rtx,
+  if (reg != frame_pointer_rtx || virtuals_instantiated)
+    addr = plus_constant (Pmode, reg,
 			  trunc_int_for_mode
 			  (slot_offset + bigend_correction
 			   + STARTING_FRAME_OFFSET, Pmode));
@@ -504,6 +503,7 @@ assign_stack_local_1 (machine_mode mode, HOST_WIDE_INT size,
   if (frame_offset_overflow (frame_offset, current_function_decl))
     frame_offset = 0;
 
+  frame->frame_offset = frame_offset;
   return x;
 }
 
@@ -573,7 +573,6 @@ struct temp_address_hasher : ggc_ptr_hash<temp_slot_address_entry>
 /* A table of addresses that represent a stack slot.  The table is a mapping
    from address RTXen to a temp slot.  */
 static GTY(()) hash_table<temp_address_hasher> *temp_slot_address_table;
-static size_t n_temp_slots_in_use;
 
 /* Removes temporary slot TEMP from LIST.  */
 
@@ -602,35 +601,35 @@ insert_slot_to_list (struct temp_slot *temp, struct temp_slot **list)
   *list = temp;
 }
 
-/* Returns the list of used temp slots at LEVEL.  */
+/* Returns the list of used temp slots at LEVEL in frame FRAME.  */
 
 static struct temp_slot **
-temp_slots_at_level (int level)
+temp_slots_at_level (frame_info *frame, int level)
 {
-  if (level >= (int) vec_safe_length (used_temp_slots))
-    vec_safe_grow_cleared (used_temp_slots, level + 1);
+  if (level >= (int) vec_safe_length (frame->used_temp_slots))
+    vec_safe_grow_cleared (frame->used_temp_slots, level + 1);
 
-  return &(*used_temp_slots)[level];
+  return &(*frame->used_temp_slots)[level];
 }
 
-/* Returns the maximal temporary slot level.  */
+/* Returns the maximal temporary slot level in frame FRAME.  */
 
 static int
-max_slot_level (void)
+max_slot_level (frame_info *frame)
 {
-  if (!used_temp_slots)
+  if (!frame->used_temp_slots)
     return -1;
 
-  return used_temp_slots->length () - 1;
+  return frame->used_temp_slots->length () - 1;
 }
 
-/* Moves temporary slot TEMP to LEVEL.  */
+/* Moves temporary slot TEMP to LEVEL in frame FRAME.  */
 
 static void
-move_slot_to_level (struct temp_slot *temp, int level)
+move_slot_to_level (frame_info *frame, struct temp_slot *temp, int level)
 {
-  cut_slot_from_list (temp, temp_slots_at_level (temp->level));
-  insert_slot_to_list (temp, temp_slots_at_level (level));
+  cut_slot_from_list (temp, temp_slots_at_level (frame, temp->level));
+  insert_slot_to_list (temp, temp_slots_at_level (frame, level));
   temp->level = level;
 }
 
@@ -1196,16 +1195,40 @@ pop_temp_slots (void)
   temp_slot_level--;
 }
 
+/* Initialize temporary slots for frame FRAME.  DOWNWARD is the value of
+   the applicable FRAME_GROWS_DOWNWARD setting.  BASE is the base register
+   for this frame.  */
+
+void
+init_temp_slots_frame (frame_info *frame, rtx base, bool downward)
+{
+  /* We have not allocated any temporaries yet.  */
+  frame->avail_temp_slots = 0;
+  vec_alloc (frame->used_temp_slots, 0);
+  frame->temp_slot_level = 0;
+  frame->n_temp_slots_in_use = 0;
+  frame->grows_downward = downward;
+  frame->base = base;
+}
+
 /* Initialize temporary slots.  */
 
 void
 init_temp_slots (void)
 {
-  /* We have not allocated any temporaries yet.  */
-  avail_temp_slots = 0;
-  vec_alloc (used_temp_slots, 0);
-  temp_slot_level = 0;
-  n_temp_slots_in_use = 0;
+  init_temp_slots_frame (&crtl->frame, frame_pointer_rtx,
+			 FRAME_GROWS_DOWNWARD);
+  init_temp_slots_frame (&crtl->rz_frame, stack_pointer_rtx, false);
+
+  /* Calculate how many bytes the start of local variables is off from
+     stack alignment.  This is only used for the regular frame, if it
+     becomes necessary to do something for the redzone frame, we should
+     add the necessary starting offset macro.  */
+  int align = PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT;
+  int off = STARTING_FRAME_OFFSET % align;
+
+  crtl->frame.phase = off ? align - off : 0;
+  crtl->rz_frame.phase = 0;
 
   /* Set up the table to map addresses to temp slots.  */
   if (! temp_slot_address_table)
diff --git a/gcc/function.h b/gcc/function.h
index b2e4f71..b436f73 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -563,7 +563,8 @@ extern HOST_WIDE_INT get_frame_size (void);
    return FALSE.  */
 extern bool frame_offset_overflow (HOST_WIDE_INT, tree);
 
-extern rtx assign_stack_local_1 (machine_mode, HOST_WIDE_INT, int, int);
+extern rtx assign_stack_local_1 (frame_info *, machine_mode, HOST_WIDE_INT,
+				 int, int);
 extern rtx assign_stack_local (machine_mode, HOST_WIDE_INT, int);
 extern rtx assign_stack_temp_for_type (machine_mode, HOST_WIDE_INT, tree);
 extern rtx assign_stack_temp (machine_mode, HOST_WIDE_INT);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 14:11   ` Sebastian Pop
  2015-11-06 14:32     ` Bernd Schmidt
@ 2015-11-06 18:43     ` Jeff Law
  1 sibling, 0 replies; 11+ messages in thread
From: Jeff Law @ 2015-11-06 18:43 UTC (permalink / raw)
  To: Sebastian Pop, Bernd Schmidt; +Cc: Abe, gcc-patches, Kyrill Tkachov

On 11/06/2015 07:10 AM, Sebastian Pop wrote:
> On Fri, Nov 6, 2015 at 2:56 AM, Bernd Schmidt <bschmidt@redhat.com> wrote:
>> Formatting problem, here and in a few other places. I didn't fully read the
>> patch this time around.
>>
>> I'm probably not reviewing further patches because I don't see this
>> progressing to a state where it's acceptable. Others may do so, but as far
>> as I'm concerned the patch is rejected.
>
> Bernd,
> I would like to ask you to focus on the technical part, and provide a
> review only based on technical reasons.
> Please ignore all formatting changes: I will help address all those changes.
> I will send a patch addressing all the comments you had in the current review.
A note that sometimes bad formatting makes it fairly painful to review 
patches.  IMHO I strongly prefer the submitter, even for an RFC, to 
make an honest attempt to have the code properly formatted.

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-06 16:28         ` Bernd Schmidt
@ 2015-11-07 12:02           ` Bernhard Reutner-Fischer
  0 siblings, 0 replies; 11+ messages in thread
From: Bernhard Reutner-Fischer @ 2015-11-07 12:02 UTC (permalink / raw)
  To: Bernd Schmidt, Sebastian Pop, Bernd Schmidt
  Cc: Abe, gcc-patches, Kyrill Tkachov

On November 6, 2015 5:27:44 PM GMT+01:00, Bernd Schmidt <bernds_cb1@t-online.de> wrote:
>On 11/06/2015 04:52 PM, Sebastian Pop wrote:
>
>>> opinion). If you want a half-finished redzone allocator, I can send
>you a
>>> patch.
>>
>> Yes please.  Let's get it work.
>
>Here you go. This is incomplete and does not compile, but it shows the 
>direction I have in mind and isn't too far off. I had a similar patch 

--- a/gcc/function.c
+++ b/gcc/function.c
@@ -217,10 +217,10 @@ free_after_compilation (struct function *f)
 HOST_WIDE_INT
 get_frame_size (void)
 {
-  if (FRAME_GROWS_DOWNWARD)
-    return -frame_offset;
+  if (-crtl->frame.grows_downward)
+    return -crtl->frame.frame_offset;
   else
-    return frame_offset;
+    return crtl->frame.frame_offset;
 }
 
frame.grows_downward is a bool it seems and as such I wonder what the minus in the condition means or is supposed to achieve?
Something we (should?) warn about?

Just curious..
Cheers,


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
       [not found]     ` <563CE5C2.5060409@redhat.com>
@ 2015-11-10 21:35       ` Abe
  2015-11-11 15:01         ` Bernd Schmidt
  0 siblings, 1 reply; 11+ messages in thread
From: Abe @ 2015-11-10 21:35 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

> This conversation should be on-list btw.

ACK.


> What I'm saying is I don't see a reason for a "definitely always unsafe" state.
> Why would any access not be made safe if a scratchpad is used?

Because the RTL if-converter doesn`t "know" how to convert {everything that can be made safe using a scratchpad and is
unsafe otherwise}.  My patch is only designed to enable the conversion of half-hammocks with a single may-trap store.
Kyrill`s work to make the single-store RTL conversions also work for multiple stores under a single condition may
or may not mesh nicely with my code to produce the expected combined effect.  I have also not done any work yet
to enable the conversion of half-hammocks with a may-trap store _and_ a may-trap load, as in the following:

   if (condition)
     *dest = *src;
   // no "else" or "else if"


In summary, the 3 possible analysis outcomes are something like this:

   * safe even without a scratchpad

   * only safe with    a scratchpad, and we _do_ know how to convert it safely

   * currently unsafe because we don`t yet       know how to convert it safely



>>> Which target do you actually want this for?

>> Ideally, all targets WITH "cmove"-like ops and withOUT
>> pre-64-bit-ARM-style fully-conditional execution.

> Anything in particular you're working on?

Where I work we are focused on the AArch64 ISA.


> Do you have performance numbers anywhere?

I think my analysis work so far on this project is not yet complete enough for
public review, mainly because it does not include the benefit of profiling.

Doing the conversion only when the profile favors doing so should remove regressions
and make the average a stronger result.  As I previously mentioned, the latest version of
this patch only does the new if-conversion when the profile and the ["--param"-alterable]
threshold "tell" it to do so, except when the testing-oriented "--param" is used.

Sincerely,

Abe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-10 21:35       ` Abe
@ 2015-11-11 15:01         ` Bernd Schmidt
  2015-11-11 17:01           ` Abe
  0 siblings, 1 reply; 11+ messages in thread
From: Bernd Schmidt @ 2015-11-11 15:01 UTC (permalink / raw)
  To: Abe; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

On 11/10/2015 10:35 PM, Abe wrote:
> I wrote:
>> What I'm saying is I don't see a reason for a "definitely always
>> unsafe" state.
>> Why would any access not be made safe if a scratchpad is used?
>
> Because the RTL if-converter doesn`t "know" how to convert
> {everything that can be made safe using a scratchpad and is unsafe
> otherwise}. My patch is only designed to enable the conversion of
> half-hammocks with a single may-trap store.

Yeah, but how is that relevant to the question of whether a MEM is safe? 
The logic should be
  if mem is safe and we are allowed to speculate -> just do it
  otherwise mem is unsafe, so
    if we have prereqs like conditional moves -> use scratchpads
    otherwise fail

I don't see how a three-state property for a single MEM is necessary or 
helpful, and the implementation in the patch just roughly distinguishes 
between two different types of trap (invalid address vs. readonly 
memory). That seems irrelevant both to the question of whether something 
is safe or not, and to the question of whether we know how to perform 
the conversion.

You might argue that something that is known readonly will always fail 
if written to at runtime, but that's no different from any other kind of 
invalid address, and using a scratchpad prevents the write unless it 
would have happened without if-conversion.

> In summary, the 3 possible analysis outcomes are something like this:
>
>    * safe even without a scratchpad
>
>    * only safe with    a scratchpad, and we _do_ know how to convert it
> safely
>
>    * currently unsafe because we don`t yet       know how to convert it
> safely

This could be seen as a property of the block being converted, and is 
kind of implicit in the algorithm sketched above, but I don't see how it 
would be a property of the MEM that represents the store.

>> Do you have performance numbers anywhere?
>
> I think my analysis work so far on this project is not yet complete
> enough for public review, mainly because it does not include the
> benefit of  profiling.

I think performance numbers are a fairly important part of a submission 
like this where the transformation isn't an obvious improvement (as 
opposed to removing an instruction or suchlike).


Bernd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
  2015-11-11 15:01         ` Bernd Schmidt
@ 2015-11-11 17:01           ` Abe
  0 siblings, 0 replies; 11+ messages in thread
From: Abe @ 2015-11-11 17:01 UTC (permalink / raw)
  To: Bernd Schmidt; +Cc: Sebastian Pop, Kyrill Tkachov, gcc-patches

> I don't see how a three-state property for a single MEM is necessary or helpful

I guess I could coalesce those two callees into one callee that still returns only
a bool, but I was trying not to make gratuitous changes to the existing code.


> I think performance numbers are a fairly important part of a submission like this

Understood and agreed.

That having been said, I have already analyzed the assembly code that results from
my new if conversion, and it is clear that sometimes doing the conversion allows
other GCC passes to do a better job because the code in question is now one big
basic block; before that change the other passes in question were "nervous",
and therefor did not do the optimization that they otherwise would have done,
because they were unable to prove the correctness of the transformation.


 > where the transformation isn't an obvious improvement
 > (as opposed to removing an instruction or suchlike).

If-conversion can indirectly lead to the removal of _several_ instructions
due to the unification of basic blocks and the removal of labels,
such that other passes can see that there is no way [barring a malfunction
or human tampering e.g. via a debugger or a security exploit] for control
flow to enter in the middle and invalidate liveness assumptions.

I will paste in my source code for a simple torture test I wrote in order to check
the operation of the new scratchpad allocation algorithm, as well as the AArch64
[64-bit ARM] assembly code with and without my work.  Without adding any other
optimizations myself, GCC [at "-O3" in both] did much better with the conversion
than without it at compiling code with a repeated constant.  The scheduler was
also much more free to hoist loads and sink stores, thereby filling in
otherwise-empty "bubbles" in the CPU pipeline, thereby using machine cycles for
beneficial work instead of wasting those same cycles sitting around doing nothing
while waiting for data to be fetched from main RAM because it is not in cache.

Of note, for the test-case source code shown below, with my new if conversion
GCC is doing a _great_ job of re-using the value 127 across integer-size boundaries,
i.e. using the fact that the 64-bit value 127 has the 32-bit value 127 as its lower
32 bits, etc.  In the original test case, which has the assignments in opposite order,
GCC fails to do so across integer-size boundaries even with my new if conversion,
which probably indicates room for improvement in other optimization passes.
In fact, it even redundantly loads the 32-bit value 127 three times.
A coworker of mine said this last part is probably a sign of a bug in GCC.

Of course, if/when the conditional branches in question are not very predictable
and the data upon which they depend is frequently not in cache, then the
if conversion is an even bigger win than it is just by eliminating instructions.

The code I mentioned above follows my sign-off.

Sincerely,

Abe










/* Global arrays, one per basic integer size.  For each array, element [1]
   is the tested condition and element [2] is the conditionally written
   destination of the half hammock.  */
char       C[9];
short      S[9];
int        I[9];
long       L[9];
long long LL[9];


/* Torture test for the scratchpad-sizing algorithm: five independent
   half hammocks (a conditional store with no "else"), one per element
   size, so the if converter must pick a single scratchpad sized for the
   largest converted store (8 bytes here — see the cover letter's note
   that the pad is sized once per routine to the biggest converted type).  */
void half_hammock_torture() {

   if (LL[1])  LL[2] = 127;
   if ( L[1])   L[2] = 127;
   if ( I[1])   I[2] = 127;
   if ( S[1])   S[2] = 127;
   if ( C[1])   C[2] = 127;

}





	.file	"spad-allocation-algorithm_torture_test___reversed_order.c"
	.text
	.align	2
	.align	3
	.global	half_hammock_torture
	.arch armv8-a+fp+simd
	//.tune generic
	.type	half_hammock_torture, %function
half_hammock_torture:
	adrp	x0, LL
	add	x0, x0, :lo12:LL
	ldr	x1, [x0, 8]
	cbz	x1, .L2
	mov	x1, 127
	str	x1, [x0, 16]
.L2:
	adrp	x0, L
	add	x0, x0, :lo12:L
	ldr	x1, [x0, 8]
	cbz	x1, .L3
	mov	x1, 127
	str	x1, [x0, 16]
.L3:
	adrp	x0, I
	add	x0, x0, :lo12:I
	ldr	w1, [x0, 4]
	cbz	w1, .L4
	mov	w1, 127
	str	w1, [x0, 8]
.L4:
	adrp	x0, S
	add	x0, x0, :lo12:S
	ldrsh	w1, [x0, 2]
	cbz	w1, .L5
	mov	w1, 127
	strh	w1, [x0, 4]
.L5:
	adrp	x0, C
	add	x0, x0, :lo12:C
	ldrb	w1, [x0, 1]
	cbz	w1, .L1
	mov	w1, 127
	strb	w1, [x0, 2]
.L1:
	ret
	.size	half_hammock_torture, .-half_hammock_torture
	.comm	LL,72,8
	.comm	L,72,8
	.comm	I,36,8
	.comm	S,18,8
	.comm	C,9,8
	.ident	"GCC: (GNU) 6.0.0 20151001 (experimental)"
	.section	.note.GNU-stack,"",%progbits





	.file	"spad-allocation-algorithm_torture_test___reversed_order.c"
	.text
	.align	2
	.align	3
	.global	half_hammock_torture
	.arch armv8-a+fp+simd
	//.tune generic
	.type	half_hammock_torture, %function
half_hammock_torture:
	adrp	x3, LL
	adrp	x2, L
	add	x3, x3, :lo12:LL
	add	x2, x2, :lo12:L
	adrp	x1, I
	adrp	x0, S
	add	x1, x1, :lo12:I
	add	x0, x0, :lo12:S
	ldr	x6, [x3, 8]
	sub	sp, sp, #16
	ldr	x5, [x2, 8]
	mov	x4, sp
	ldr	w7, [x1, 4]
	cmp	x6, xzr
	add	x3, x3, 16
	ldrsh	w6, [x0, 2]
	csel	x3, x4, x3, eq
	add	x2, x2, 16
	cmp	x5, xzr
	add	x1, x1, 8
	mov	x5, 127
	csel	x2, x4, x2, eq
	cmp	w7, wzr
	add	x0, x0, 4
	csel	x1, x4, x1, eq
	cmp	w6, wzr
	str	x5, [x3]
	csel	x0, x4, x0, eq
	str	x5, [x2]
	adrp	x2, C
	str	w5, [x1]
	add	x1, x2, :lo12:C
	strh	w5, [x0]
	add	x0, x1, 2
	ldrb	w1, [x1, 1]
	cmp	w1, wzr
	csel	x4, x4, x0, eq
	strb	w5, [x4]
	add	sp, sp, 16
	ret
	.size	half_hammock_torture, .-half_hammock_torture
	.comm	LL,72,8
	.comm	L,72,8
	.comm	I,36,8
	.comm	S,18,8
	.comm	C,9,8
	.ident	"GCC: (GNU) 6.0.0 20151001 (experimental)"
	.section	.note.GNU-stack,"",%progbits

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2015-11-11 17:01 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-05 23:43 improved RTL-level if conversion using scratchpads [half-hammock edition] Abe
2015-11-06 10:56 ` Bernd Schmidt
2015-11-06 14:11   ` Sebastian Pop
2015-11-06 14:32     ` Bernd Schmidt
2015-11-06 15:52       ` Sebastian Pop
2015-11-06 16:28         ` Bernd Schmidt
2015-11-07 12:02           ` Bernhard Reutner-Fischer
2015-11-06 18:43     ` Jeff Law
     [not found]   ` <563CDCAF.20703@yahoo.com>
     [not found]     ` <563CE5C2.5060409@redhat.com>
2015-11-10 21:35       ` Abe
2015-11-11 15:01         ` Bernd Schmidt
2015-11-11 17:01           ` Abe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).