public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [ARM] Fix PR middle-end/65958
@ 2015-06-11 16:15 Eric Botcazou
  2015-06-17 10:41 ` Ramana Radhakrishnan
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Botcazou @ 2015-06-11 16:15 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2535 bytes --]

Hi,

this fixes PR middle-end/65958 on the ARM, the architecture for which it was 
reported, by implementing stack checking by means of probing in the back-end.  
Other mainstream back-ends (alpha, i386, ia64, mips, rs6000, sparc) already 
have such an implementation.  The middle-end contains a generic implementation 
but it has severe limitations and can generate wrong code in conjunction with 
dynamic allocation patterns (PR 65958 is an example with dynamic arrays).

As for the other architectures, the patch implements stack probing for the 
static part of the frame, i.e. it generates stack probes in the prologue of 
functions which have a large frame or are non-leaf before establishing the 
frame.  Unfortunately, as is the case for x86-32, the scarcity of registers 
coupled with the calling conventions make the implementation a bit convoluted, 
but it works for both APCS and non-APCS frames, ARM and Thumb-2 modes.  We 
have been using it at AdaCore for a couple of years on Linux and VxWorks.

Tested on arm-eabi and arm-linux-gnueabi, OK for the mainline?


2015-06-11  Eric Botcazou  <ebotcazou@adacore.com>

	PR middle-end/65958
	* config/arm/linux-elf.h (STACK_CHECK_STATIC_BUILTIN): Define.
	* config/arm/arm-protos.h (output_probe_stack_range): Declare.
	* config/arm/arm.c: Include common/common-target.h.
	(use_return_insn): Return 0 if the static chain register was saved
	above a non-APCS frame.
	(arm_compute_static_chain_stack_bytes): Adjust for stack checking.
	(struct scratch_reg): New.
	(get_scratch_register_on_entry): New function.
	(release_scratch_register_on_entry): Likewise.
	(arm_emit_probe_stack_range): Likewise.
	(output_probe_stack_range): Likewise.
	(arm_expand_prologue): Factor out code dealing with the IP register
	for nested function and adjust it for stack checking.
	Invoke arm_emit_probe_stack_range if static builtin stack checking
	is enabled.
	(thumb1_expand_prologue): Sorry out if static builtin stack checking
	is enabled.
	(arm_expand_epilogue): Add the saved static chain register, if any, to
	the amount of pre-pushed registers to pop.
	(arm_frame_pointer_required): Return true if static stack checking is
	enabled and we want to catch the exception with the EABI unwinder.
	* config/arm/unspecs.md (UNSPEC_PROBE_STACK): New constant.
	(UNSPEC_PROBE_STACK_RANGE): Likewise.
	* config/arm/arm.md (probe_stack): New insn.
	(probe_stack_range): Likewise.


2015-06-11  Eric Botcazou  <ebotcazou@adacore.com>

	* gcc.target/arm/stack-checking.c: New test.


-- 
Eric Botcazou

[-- Attachment #2: pr65958.diff --]
[-- Type: text/x-patch, Size: 24413 bytes --]

Index: config/arm/linux-elf.h
===================================================================
--- config/arm/linux-elf.h	(revision 224264)
+++ config/arm/linux-elf.h	(working copy)
@@ -124,3 +124,6 @@
    to COPY relocated symbol in the executable.  See PR65780.  */
 #undef TARGET_BINDS_LOCAL_P
 #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
+
+/* Define this to be nonzero if static stack checking is supported.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 224264)
+++ config/arm/arm.c	(working copy)
@@ -72,6 +72,7 @@
 #include "target.h"
 #include "sched-int.h"
 #include "target-def.h"
+#include "common/common-target.h"
 #include "debug.h"
 #include "langhooks.h"
 #include "df.h"
@@ -3599,7 +3600,11 @@ use_return_insn (int iscond, rtx sibling
       /* Or if there is a stack adjustment.  However, if the stack pointer
 	 is saved on the stack, we can use a pre-incrementing stack load.  */
       || !(stack_adjust == 0 || (TARGET_APCS_FRAME && frame_pointer_needed
-				 && stack_adjust == 4)))
+				 && stack_adjust == 4))
+      /* Or if the static chain register was saved above the frame, under the
+	 assumption that the stack pointer isn't saved on the stack.  */
+      || (!(TARGET_APCS_FRAME && frame_pointer_needed)
+          && arm_compute_static_chain_stack_bytes() != 0))
     return 0;
 
   saved_int_regs = offsets->saved_regs_mask;
@@ -19080,8 +19085,10 @@ static int
 arm_compute_static_chain_stack_bytes (void)
 {
   /* See the defining assertion in arm_expand_prologue.  */
-  if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM
-      && IS_NESTED (arm_current_func_type ())
+  if (IS_NESTED (arm_current_func_type ())
+      && ((TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
+	  || (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+	      && !df_regs_ever_live_p (LR_REGNUM)))
       && arm_r3_live_at_start_p ()
       && crtl->args.pretend_args_size == 0)
     return 4;
@@ -19176,7 +19183,6 @@ arm_compute_save_reg_mask (void)
   return save_reg_mask;
 }
 
-
 /* Compute a bit mask of which registers need to be
    saved on the stack for the current function.  */
 static unsigned long
@@ -21004,6 +21010,240 @@ thumb_set_frame_pointer (arm_stack_offse
   RTX_FRAME_RELATED_P (insn) = 1;
 }
 
+struct scratch_reg {
+  rtx reg;
+  bool saved;
+};
+
+/* Return a short-lived scratch register for use as a 2nd scratch register on
+   function entry after the registers are saved in the prologue.  This register
+   must be released by means of release_scratch_register_on_entry.  IP is not
+   considered since it is always used as the 1st scratch register if available.
+
+   REGNO1 is the index number of the 1st scratch register and LIVE_REGS is the
+   mask of live registers.  */
+
+static void
+get_scratch_register_on_entry (struct scratch_reg *sr, unsigned int regno1,
+			       unsigned long live_regs)
+{
+  int regno = -1;
+
+  sr->saved = false;
+
+  if (regno1 != LR_REGNUM && (live_regs & (1 << LR_REGNUM)) != 0)
+    regno = LR_REGNUM;
+  else
+    {
+      unsigned int i;
+
+      for (i = 4; i < 11; i++)
+	if (regno1 != i && (live_regs & (1 << i)) != 0)
+	  {
+	    regno = i;
+	    break;
+	  }
+
+      if (regno < 0)
+	{
+	  /* If IP is used as the 1st scratch register for a nested function,
+	     then either r3 wasn't available or is used to preserve IP.  */
+	  if (regno1 == IP_REGNUM && IS_NESTED (arm_current_func_type ()))
+	    regno1 = 3;
+	  regno = (regno1 == 3 ? 2 : 3);
+	  sr->saved
+	    = REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)),
+			       regno);
+	}
+    }
+
+  sr->reg = gen_rtx_REG (SImode, regno);
+  if (sr->saved)
+    {
+      rtx addr = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
+      rtx insn = emit_set_insn (gen_frame_mem (SImode, addr), sr->reg);
+      rtx x = gen_rtx_SET (stack_pointer_rtx,
+		           plus_constant (Pmode, stack_pointer_rtx, -4));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
+    }
+}
+
+/* Release a scratch register obtained from the preceding function.  */
+
+static void
+release_scratch_register_on_entry (struct scratch_reg *sr)
+{
+  if (sr->saved)
+    {
+      rtx addr = gen_rtx_POST_INC (Pmode, stack_pointer_rtx);
+      rtx insn = emit_set_insn (sr->reg, gen_frame_mem (SImode, addr));
+      rtx x = gen_rtx_SET (stack_pointer_rtx,
+			   plus_constant (Pmode, stack_pointer_rtx, 4));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
+    }
+}
+
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+#if PROBE_INTERVAL > 4096
+#error Cannot use indexed addressing mode for stack probing
+#endif
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  REGNO1
+   is the index number of the 1st scratch register and LIVE_REGS is the
+   mask of live registers.  */
+
+static void
+arm_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
+			    unsigned int regno1, unsigned long live_regs)
+{
+  rtx reg1 = gen_rtx_REG (Pmode, regno1);
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  */
+  if (size <= PROBE_INTERVAL)
+    {
+      emit_move_insn (reg1, GEN_INT (first + PROBE_INTERVAL));
+      emit_set_insn (reg1, gen_rtx_MINUS (Pmode, stack_pointer_rtx, reg1));
+      emit_stack_probe (plus_constant (Pmode, reg1, PROBE_INTERVAL - size));
+    }
+
+  /* The run-time loop is made up of 10 insns in the generic case while the
+     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
+  else if (size <= 5 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, rem;
+
+      emit_move_insn (reg1, GEN_INT (first + PROBE_INTERVAL));
+      emit_set_insn (reg1, gen_rtx_MINUS (Pmode, stack_pointer_rtx, reg1));
+      emit_stack_probe (reg1);
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
+	 it exceeds SIZE.  If only two probes are needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -PROBE_INTERVAL));
+	  emit_stack_probe (reg1);
+	}
+
+      rem = size - (i - PROBE_INTERVAL);
+      if (rem > 4095 || (TARGET_THUMB2 && rem > 255))
+	{
+	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -PROBE_INTERVAL));
+	  emit_stack_probe (plus_constant (Pmode, reg1, PROBE_INTERVAL - rem));
+	}
+      else
+	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size;
+      struct scratch_reg sr;
+
+      get_scratch_register_on_entry (&sr, regno1, live_regs);
+
+      emit_move_insn (reg1, GEN_INT (first));
+
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+      emit_move_insn (sr.reg, GEN_INT (rounded_size));
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_ADDR = SP + FIRST.  */
+      emit_set_insn (reg1, gen_rtx_MINUS (Pmode, stack_pointer_rtx, reg1));
+
+      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
+      emit_set_insn (sr.reg, gen_rtx_MINUS (Pmode, reg1, sr.reg));
+
+
+      /* Step 3: the loop
+
+	 while (TEST_ADDR != LAST_ADDR)
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+
+	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+	 until it is equal to ROUNDED_SIZE.  */
+
+      emit_insn (gen_probe_stack_range (reg1, reg1, sr.reg));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  HOST_WIDE_INT rem = size - rounded_size;
+
+	  if (rem > 4095 || (TARGET_THUMB2 && rem > 255))
+	    {
+	      emit_set_insn (sr.reg,
+			     plus_constant (Pmode, sr.reg, -PROBE_INTERVAL));
+	      emit_stack_probe (plus_constant (Pmode, sr.reg,
+					       PROBE_INTERVAL - rem));
+	    }
+	  else
+	    emit_stack_probe (plus_constant (Pmode, sr.reg, -rem));
+	}
+
+      release_scratch_register_on_entry (&sr);
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
+   absolute addresses.  */
+
+const char *
+output_probe_stack_range (rtx reg1, rtx reg2)
+{
+  static int labelno = 0;
+  char loop_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+   /* Test if TEST_ADDR == LAST_ADDR.  */
+  xops[0] = reg1;
+  xops[1] = reg2;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  if (TARGET_THUMB2)
+    fputs ("\tittt\tne\n", asm_out_file);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("subne\t%0, %0, %1", xops);
+
+  /* Probe at TEST_ADDR and branch.  */
+  output_asm_insn ("strne\tr0, [%0, #0]", xops);
+  fputs ("\tbne\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  return "";
+}
+
 /* Generate the prologue instructions for entry into an ARM or Thumb-2
    function.  */
 void
@@ -21018,7 +21258,9 @@ arm_expand_prologue (void)
   int saved_pretend_args = 0;
   int saved_regs = 0;
   unsigned HOST_WIDE_INT args_to_push;
+  HOST_WIDE_INT size;
   arm_stack_offsets *offsets;
+  bool clobber_ip;
 
   func_type = arm_current_func_type ();
 
@@ -21069,9 +21311,88 @@ arm_expand_prologue (void)
       emit_insn (gen_movsi (stack_pointer_rtx, r1));
     }
 
-  /* For APCS frames, if IP register is clobbered
-     when creating frame, save that register in a special
-     way.  */
+  /* The static chain register is the same as the IP register.  If it is
+     clobbered when creating the frame, we need to save and restore it.  */
+  clobber_ip = IS_NESTED (func_type)
+	       && ((TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
+		   || (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+		       && !df_regs_ever_live_p (LR_REGNUM)
+		       && arm_r3_live_at_start_p ()));
+
+  /* Find somewhere to store IP whilst the frame is being created.
+     We try the following places in order:
+
+       1. The last argument register r3 if it is available.
+       2. A slot on the stack above the frame if there are no
+	  arguments to push onto the stack.
+       3. Register r3 again, after pushing the argument registers
+	  onto the stack, if this is a varargs function.
+       4. The last slot on the stack created for the arguments to
+	  push, if this isn't a varargs function.
+
+     Note - we only need to tell the dwarf2 backend about the SP
+     adjustment in the second variant; the static chain register
+     doesn't need to be unwound, as it doesn't contain a value
+     inherited from the caller.  */
+  if (clobber_ip)
+    {
+      if (!arm_r3_live_at_start_p ())
+	insn = emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx);
+      else if (args_to_push == 0)
+	{
+	  rtx addr, dwarf;
+
+	  gcc_assert(arm_compute_static_chain_stack_bytes() == 4);
+	  saved_regs += 4;
+
+	  addr = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
+	  insn = emit_set_insn (gen_frame_mem (SImode, addr), ip_rtx);
+	  fp_offset = 4;
+
+	  /* Just tell the dwarf backend that we adjusted SP.  */
+	  dwarf = gen_rtx_SET (stack_pointer_rtx,
+			       plus_constant (Pmode, stack_pointer_rtx,
+					      -fp_offset));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+	}
+      else
+	{
+	  /* Store the args on the stack.  */
+	  if (cfun->machine->uses_anonymous_args)
+	    {
+	      insn = emit_multi_reg_push ((0xf0 >> (args_to_push / 4)) & 0xf,
+					  (0xf0 >> (args_to_push / 4)) & 0xf);
+	      emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx);
+	      saved_pretend_args = 1;
+	    }
+	  else
+	    {
+	      rtx addr, dwarf;
+
+	      if (args_to_push == 4)
+		addr = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
+	      else
+		addr = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx,
+					   plus_constant (Pmode,
+							  stack_pointer_rtx,
+							  -args_to_push));
+
+	      insn = emit_set_insn (gen_frame_mem (SImode, addr), ip_rtx);
+
+	      /* Just tell the dwarf backend that we adjusted SP.  */
+	      dwarf = gen_rtx_SET (stack_pointer_rtx,
+				   plus_constant (Pmode, stack_pointer_rtx,
+						  -args_to_push));
+	      add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+	    }
+
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  fp_offset = args_to_push;
+	  args_to_push = 0;
+	}
+    }
+
   if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
     {
       if (IS_INTERRUPT (func_type))
@@ -21093,86 +21414,6 @@ arm_expand_prologue (void)
 	     Anyway this instruction is not really part of the stack
 	     frame creation although it is part of the prologue.  */
 	}
-      else if (IS_NESTED (func_type))
-	{
-	  /* The static chain register is the same as the IP register
-	     used as a scratch register during stack frame creation.
-	     To get around this need to find somewhere to store IP
-	     whilst the frame is being created.  We try the following
-	     places in order:
-
-	       1. The last argument register r3 if it is available.
-	       2. A slot on the stack above the frame if there are no
-		  arguments to push onto the stack.
-	       3. Register r3 again, after pushing the argument registers
-	          onto the stack, if this is a varargs function.
-	       4. The last slot on the stack created for the arguments to
-		  push, if this isn't a varargs function.
-
-	     Note - we only need to tell the dwarf2 backend about the SP
-	     adjustment in the second variant; the static chain register
-	     doesn't need to be unwound, as it doesn't contain a value
-	     inherited from the caller.  */
-
-	  if (!arm_r3_live_at_start_p ())
-	    insn = emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx);
-	  else if (args_to_push == 0)
-	    {
-	      rtx addr, dwarf;
-
-	      gcc_assert(arm_compute_static_chain_stack_bytes() == 4);
-	      saved_regs += 4;
-
-	      addr = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
-	      insn = emit_set_insn (gen_frame_mem (SImode, addr), ip_rtx);
-	      fp_offset = 4;
-
-	      /* Just tell the dwarf backend that we adjusted SP.  */
-	      dwarf = gen_rtx_SET (stack_pointer_rtx,
-				   plus_constant (Pmode, stack_pointer_rtx,
-						  -fp_offset));
-	      RTX_FRAME_RELATED_P (insn) = 1;
-	      add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
-	    }
-	  else
-	    {
-	      /* Store the args on the stack.  */
-	      if (cfun->machine->uses_anonymous_args)
-		{
-		  insn
-		    = emit_multi_reg_push ((0xf0 >> (args_to_push / 4)) & 0xf,
-					   (0xf0 >> (args_to_push / 4)) & 0xf);
-		  emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx);
-		  saved_pretend_args = 1;
-		}
-	      else
-		{
-		  rtx addr, dwarf;
-
-		  if (args_to_push == 4)
-		    addr = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
-		  else
-		    addr
-		      = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx,
-					    plus_constant (Pmode,
-							   stack_pointer_rtx,
-							   -args_to_push));
-
-		  insn = emit_set_insn (gen_frame_mem (SImode, addr), ip_rtx);
-
-		  /* Just tell the dwarf backend that we adjusted SP.  */
-		  dwarf
-		    = gen_rtx_SET (stack_pointer_rtx,
-				   plus_constant (Pmode, stack_pointer_rtx,
-						  -args_to_push));
-		  add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
-		}
-
-	      RTX_FRAME_RELATED_P (insn) = 1;
-	      fp_offset = args_to_push;
-	      args_to_push = 0;
-	    }
-	}
 
       insn = emit_set_insn (ip_rtx,
 			    plus_constant (Pmode, stack_pointer_rtx,
@@ -21270,34 +21511,60 @@ arm_expand_prologue (void)
 	  insn = GEN_INT (-(4 + args_to_push + fp_offset));
 	  insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, ip_rtx, insn));
 	  RTX_FRAME_RELATED_P (insn) = 1;
-
-	  if (IS_NESTED (func_type))
-	    {
-	      /* Recover the static chain register.  */
-	      if (!arm_r3_live_at_start_p () || saved_pretend_args)
-		insn = gen_rtx_REG (SImode, 3);
-	      else
-		{
-		  insn = plus_constant (Pmode, hard_frame_pointer_rtx, 4);
-		  insn = gen_frame_mem (SImode, insn);
-		}
-	      emit_set_insn (ip_rtx, insn);
-	      /* Add a USE to stop propagate_one_insn() from barfing.  */
-	      emit_insn (gen_force_register_use (ip_rtx));
-	    }
 	}
       else
 	{
-	  insn = GEN_INT (saved_regs - 4);
+	  insn = GEN_INT (saved_regs - (4 + fp_offset));
 	  insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx,
 					stack_pointer_rtx, insn));
 	  RTX_FRAME_RELATED_P (insn) = 1;
 	}
     }
 
+  size = offsets->outgoing_args - offsets->saved_args;
   if (flag_stack_usage_info)
-    current_function_static_stack_size
-      = offsets->outgoing_args - offsets->saved_args;
+    current_function_static_stack_size = size;
+
+  /* If this isn't an interrupt service routine and we have a frame, then do
+     stack checking.  We use IP as the first scratch register, except for the
+     non-APCS nested functions if LR or r3 are available (see clobber_ip).  */
+  if (!IS_INTERRUPT (func_type)
+      && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      unsigned int regno;
+
+      if (!IS_NESTED (func_type) || clobber_ip)
+	regno = IP_REGNUM;
+      else if (df_regs_ever_live_p (LR_REGNUM))
+	regno = LR_REGNUM;
+      else
+	regno = 3;
+
+      if (crtl->is_leaf && !cfun->calls_alloca)
+	{
+	  if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
+	    arm_emit_probe_stack_range (STACK_CHECK_PROTECT,
+					size - STACK_CHECK_PROTECT,
+					regno, live_regs_mask);
+	}
+      else if (size > 0)
+	arm_emit_probe_stack_range (STACK_CHECK_PROTECT, size,
+				    regno, live_regs_mask);
+    }
+
+  /* Recover the static chain register.  */
+  if (clobber_ip)
+    {
+      if (!arm_r3_live_at_start_p () || saved_pretend_args)
+	insn = gen_rtx_REG (SImode, 3);
+      else
+	{
+	  insn = plus_constant (Pmode, hard_frame_pointer_rtx, 4);
+	  insn = gen_frame_mem (SImode, insn);
+	}
+      emit_set_insn (ip_rtx, insn);
+      emit_insn (gen_force_register_use (ip_rtx));
+    }
 
   if (offsets->outgoing_args != offsets->saved_args + saved_regs)
     {
@@ -24258,6 +24525,7 @@ thumb1_expand_prologue (void)
   rtx_insn *insn;
 
   HOST_WIDE_INT amount;
+  HOST_WIDE_INT size;
   arm_stack_offsets *offsets;
   unsigned long func_type;
   int regno;
@@ -24492,9 +24760,13 @@ thumb1_expand_prologue (void)
     emit_move_insn (gen_rtx_REG (Pmode, ARM_HARD_FRAME_POINTER_REGNUM),
 		    stack_pointer_rtx);
 
+  size = offsets->outgoing_args - offsets->saved_args;
   if (flag_stack_usage_info)
-    current_function_static_stack_size
-      = offsets->outgoing_args - offsets->saved_args;
+    current_function_static_stack_size = size;
+
+  /* If we have a frame, then do stack checking.  FIXME: not implemented.  */
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size)
+    sorry ("-fstack-check=specific for THUMB1");
 
   amount = offsets->outgoing_args - offsets->saved_regs;
   amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
@@ -25065,14 +25337,16 @@ arm_expand_epilogue (bool really_return)
         return;
     }
 
-  if (crtl->args.pretend_args_size)
+  amount
+    = crtl->args.pretend_args_size + arm_compute_static_chain_stack_bytes();
+  if (amount)
     {
       int i, j;
       rtx dwarf = NULL_RTX;
       rtx_insn *tmp =
 	emit_insn (gen_addsi3 (stack_pointer_rtx,
 			       stack_pointer_rtx,
-			       GEN_INT (crtl->args.pretend_args_size)));
+			       GEN_INT (amount)));
 
       RTX_FRAME_RELATED_P (tmp) = 1;
 
@@ -25091,7 +25365,7 @@ arm_expand_epilogue (bool really_return)
 	      }
 	  REG_NOTES (tmp) = dwarf;
 	}
-      arm_add_cfa_adjust_cfa_note (tmp, crtl->args.pretend_args_size,
+      arm_add_cfa_adjust_cfa_note (tmp, amount,
 				   stack_pointer_rtx, stack_pointer_rtx);
     }
 
@@ -27102,9 +27376,45 @@ arm_order_regs_for_local_alloc (void)
 bool
 arm_frame_pointer_required (void)
 {
-  return (cfun->has_nonlocal_label
-          || SUBTARGET_FRAME_POINTER_REQUIRED
-          || (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ()));
+  if (SUBTARGET_FRAME_POINTER_REQUIRED)
+    return true;
+
+  /* If the function receives nonlocal gotos, it needs to save the frame
+     pointer in the nonlocal_goto_save_area object.  */
+  if (cfun->has_nonlocal_label)
+    return true;
+
+  /* The frame pointer is required for non-leaf APCS frames.  */
+  if (TARGET_ARM && TARGET_APCS_FRAME && !leaf_function_p ())
+    return true;
+
+  /* If we are probing the stack in the prologue, we will have a faulting
+     instruction prior to the stack adjustment and this requires a frame
+     pointer if we want to catch the exception using the EABI unwinder.  */
+  if (!IS_INTERRUPT (arm_current_func_type ())
+      && flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+      && arm_except_unwind_info (&global_options) == UI_TARGET
+      && cfun->can_throw_non_call_exceptions)
+    {
+      HOST_WIDE_INT size = get_frame_size ();
+
+      /* That's irrelevant if there is no stack adjustment.  */
+      if (size <= 0)
+	return false;
+
+      /* That's relevant only if there is a stack probe.  */
+      if (crtl->is_leaf && !cfun->calls_alloca)
+	{
+	  /* We don't have the final size of the frame so adjust.  */
+	  size += 32 * UNITS_PER_WORD;
+	  if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
+	    return true;
+	}
+      else
+	return true;
+    }
+
+  return false;
 }
 
 /* Only thumb1 can't support conditional execution, so return true if
Index: config/arm/unspecs.md
===================================================================
--- config/arm/unspecs.md	(revision 224264)
+++ config/arm/unspecs.md	(working copy)
@@ -83,6 +83,8 @@ (define_c_enum "unspec" [
                         ; FPSCR rounding mode and signal inexactness.
   UNSPEC_VRINTA         ; Represent a float to integral float rounding
                         ; towards nearest, ties away from zero.
+  UNSPEC_PROBE_STACK    ; Probe stack memory reference
+  UNSPEC_PROBE_STACK_RANGE ; Probe stack range
 ])
 
 (define_c_enum "unspec" [
Index: config/arm/arm-protos.h
===================================================================
--- config/arm/arm-protos.h	(revision 224264)
+++ config/arm/arm-protos.h	(working copy)
@@ -146,6 +146,7 @@ extern const char *output_add_immediate
 extern const char *arithmetic_instr (rtx, int);
 extern void output_ascii_pseudo_op (FILE *, const unsigned char *, int);
 extern const char *output_return_instruction (rtx, bool, bool, bool);
+extern const char *output_probe_stack_range (rtx, rtx);
 extern void arm_poke_function_name (FILE *, const char *);
 extern void arm_final_prescan_insn (rtx_insn *);
 extern int arm_debugger_arg_offset (int, rtx);
Index: config/arm/arm.md
===================================================================
--- config/arm/arm.md	(revision 224264)
+++ config/arm/arm.md	(working copy)
@@ -8171,6 +8171,27 @@ (define_insn "blockage"
    (set_attr "type" "block")]
 )
 
+(define_insn "probe_stack"
+  [(set (match_operand 0 "memory_operand" "=m")
+        (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
+  "TARGET_32BIT"
+{
+  return "str%?\\tr0, %0";
+}
+  [(set_attr "type" "store1")
+   (set_attr "predicable" "yes")]
+)
+
+(define_insn "probe_stack_range"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(match_operand:SI 1 "register_operand" "0")
+			     (match_operand:SI 2 "register_operand" "r")]
+			     UNSPEC_PROBE_STACK_RANGE))]
+  "TARGET_32BIT"
+{
+  return output_probe_stack_range (operands[0], operands[2]);
+})
+
 (define_expand "casesi"
   [(match_operand:SI 0 "s_register_operand" "")	; index to jump on
    (match_operand:SI 1 "const_int_operand" "")	; lower bound

[-- Attachment #3: stack-checking.c --]
[-- Type: text/x-csrc, Size: 280 bytes --]

/* { dg-do run { target { *-*-linux* } } } */
/* { dg-options "-fstack-check" } */

int main(void)
{
  char *p;
  if (1)
    {
      char i[48];
      p = __builtin_alloca(8);
      p[0] = 1;
    }

  if (1)
    {
      char i[48], j[64];
      j[48] = 0;
    }

  return !p[0];
}

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-06-11 16:15 [ARM] Fix PR middle-end/65958 Eric Botcazou
@ 2015-06-17 10:41 ` Ramana Radhakrishnan
  2015-06-18 19:03   ` Eric Botcazou
  0 siblings, 1 reply; 24+ messages in thread
From: Ramana Radhakrishnan @ 2015-06-17 10:41 UTC (permalink / raw)
  To: Eric Botcazou, gcc-patches

I am not entirely familiar with this feature, so please bear with me
during the review; I will find some time to do some experiments with the
feature during this week. However, a couple of things with respect to the
patch immediately spring to mind.


> +(define_insn "probe_stack_range"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +	(unspec_volatile:SI [(match_operand:SI 1 "register_operand" "0")
> +			     (match_operand:SI 2 "register_operand" "r")]
> +			     UNSPEC_PROBE_STACK_RANGE))]
> +  "TARGET_32BIT"
> +{
> +  return output_probe_stack_range (operands[0], operands[2]);
> +})
> +

Please mark this pattern with (set_attr "type" "multiple").

While I suspect that stack probing is done before any insns with invalid 
constants in the function, it would be better to model the length of 
this insn so that the minipool logic is not confused later in terms of 
placement of constant pools.

Shouldn't the pattern contain clobbers for the CC register or is that 
unnecessary for the same reason as above ?

Additionally please add

(set_attr "conds" "clob")

to this pattern so that the CCFSM state machine doesn't go awry in any 
of these cases.


regards
Ramana








^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-06-17 10:41 ` Ramana Radhakrishnan
@ 2015-06-18 19:03   ` Eric Botcazou
  2015-07-06 15:46     ` Ramana Radhakrishnan
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Botcazou @ 2015-06-18 19:03 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches

> Please mark this pattern with (set_attr "type" "multiple").

Done.

> While I suspect that stack probing is done before any insns with invalid
> constants in the function, it would be better to model the length of
> this insn so that the minipool logic is not confused later in terms of
> placement of constant pools.

OK, I can put an upper bound.

> Shouldn't the pattern contain clobbers for the CC register or is that
> unnecessary for the same reason as above ?

That's unnecessary, UNSPEC_VOLATILEs are optimization barriers so no CC-
related instructions can be moved up to before the instruction.

> Additionally please add
> 
> (set_attr "conds" "clob")
> 
> to this pattern so that the CCFSM state machine doesn't go awry in any
> of these cases.

Also done.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-06-18 19:03   ` Eric Botcazou
@ 2015-07-06 15:46     ` Ramana Radhakrishnan
  2015-09-20 21:05       ` Christophe Lyon
  2015-10-06 10:11       ` Eric Botcazou
  0 siblings, 2 replies; 24+ messages in thread
From: Ramana Radhakrishnan @ 2015-07-06 15:46 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches



On 18/06/15 20:02, Eric Botcazou wrote:
>> Please mark this pattern with (set_attr "type" "multiple").
> 
> Done.
> 
>> While I suspect that stack probing is done before any insns with invalid
>> constants in the function, it would be better to model the length of
>> this insn so that the minipool logic is not confused later in terms of
>> placement of constant pools.
> 
> OK, I can put an upper bound.
> 
>> Shouldn't the pattern contain clobbers for the CC register or is that
>> unnecessary for the same reason as above ?
> 
> That's unnecessary, UNSPEC_VOLATILEs are optimization barriers so no CC-
> related instructions can be moved up to before the instruction.
> 
>> Additionally please add
>>
>> (set_attr "conds" "clob")
>>
>> to this pattern so that the CCFSM state machine doesn't go awry in any
>> of these cases.
> 
> Also done.
> 

Thanks - I have no further comments on this patch. We probably need to implement the same on AArch64 too in order to avoid similar problems.


OK with the afore mentioned changes.

regards
Ramana

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-07-06 15:46     ` Ramana Radhakrishnan
@ 2015-09-20 21:05       ` Christophe Lyon
  2015-09-21  8:18         ` Eric Botcazou
  2015-10-06 10:11       ` Eric Botcazou
  1 sibling, 1 reply; 24+ messages in thread
From: Christophe Lyon @ 2015-09-20 21:05 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Eric Botcazou, gcc-patches

Hi Eric,


On 6 July 2015 at 17:46, Ramana Radhakrishnan
<ramana.radhakrishnan@foss.arm.com> wrote:
>
>
> On 18/06/15 20:02, Eric Botcazou wrote:
>>> Please mark this pattern with (set_attr "type" "multiple").
>>
>> Done.
>>
>>> While I suspect that stack probing is done before any insns with invalid
>>> constants in the function, it would be better to model the length of
>>> this insn so that the minipool logic is not confused later in terms of
>>> placement of constant pools.
>>
>> OK, I can put an upper bound.
>>
>>> Shouldn't the pattern contain clobbers for the CC register or is that
>>> unnecessary for the same reason as above ?
>>
>> That's unnecessary, UNSPEC_VOLATILEs are optimization barriers so no CC-
>> related instructions can be moved up to before the instruction.
>>
>>> Additionally please add
>>>
>>> (set_attr "conds" "clob")
>>>
>>> to this pattern so that the CCFSM state machine doesn't go awry in any
>>> of these cases.
>>
>> Also done.
>>
>
> Thanks - I have no further comments on this patch. We probably need to implement the same on AArch64 too in order to avoid similar problems.
>
>
> OK with the aforementioned changes.
>
> regards
> Ramana

On targets using thumb1, I can see:
- the new test failing (you should probably add a dg-skip or an
effective-target directive)
- gcc.dg/pr48134.c now fails, saying:
sorry, unimplemented: -fstack-check=specific for THUMB1
while the error message is the same as for your new test, I'm
wondering why/how it actually passed before your patch?

Christophe.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-09-20 21:05       ` Christophe Lyon
@ 2015-09-21  8:18         ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-09-21  8:18 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches, Ramana Radhakrishnan

> On targets using thumb1, I can see:
> - the new test failing (you should probably add a dg-skip or an
> effective-target directive)

OK, I have added:

/* { dg-skip-if "" { arm_thumb1 } } */

as in the ivopts-orig_biv-inc.c test.

> - gcc.dg/pr48134.c now fails, saying:
> sorry, unimplemented: -fstack-check=specific for THUMB1
> while the error message is the same as for your new test, I'm
> wondering why/how it actually passed before your patch?

Yeah, it's a bug, -fstack-check=specific should have been rejected before for 
all ARM targets since it was not implemented.  Will fix.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-07-06 15:46     ` Ramana Radhakrishnan
  2015-09-20 21:05       ` Christophe Lyon
@ 2015-10-06 10:11       ` Eric Botcazou
  2015-10-06 13:43         ` Ramana Radhakrishnan
                           ` (2 more replies)
  1 sibling, 3 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-10-06 10:11 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1708 bytes --]

> Thanks - I have no further comments on this patch. We probably need to
> implement the same on AArch64 too in order to avoid similar problems.

Here's the implementation for aarch64, very similar but simpler since there is 
no shortage of scratch registers; the only thing to note is the new blockage 
pattern.  This was tested on real hardware but not with Linux, instead with 
Darwin (experimental port of the toolchain to iOS) and makes it possible to 
pass ACATS (Ada conformance testsuite which requires stack checking).

There is also a couple of tweaks for the ARM implementation: a cosmetic one 
for the probe_stack pattern and one for the output_probe_stack_range loop.


2015-10-06  Tristan Gingold  <gingold@adacore.com>
            Eric Botcazou  <ebotcazou@adacore.com>

        PR middle-end/65958
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_stack-range):
	Declare.
	* config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
	UNSPEC_PROBE_STACK_RANGE.
	(blockage): New instruction.
	(probe_stack_range): Likewise.
	* config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
	function.
	(aarch64_output_probe_stack_range): Likewise.
	(aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
	static builtin stack checking is enabled.
	* config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
	Define.

	* config/arm/arm.c (arm_emit_probe_stack_range): Adjust comment.
	(output_probe_stack_range): Rotate the loop and simplify.
	(thumb1_expand_prologue): Tweak sorry message.
	* config/arm/arm.md (probe_stack): Use bare string.


2015-10-06  Eric Botcazou  <ebotcazou@adacore.com>

        * gcc.target/aarch64/stack-checking.c: New test.

-- 
Eric Botcazou

[-- Attachment #2: pr65958-2.diff --]
[-- Type: text/x-patch, Size: 11147 bytes --]

Index: config/aarch64/aarch64-linux.h
===================================================================
--- config/aarch64/aarch64-linux.h	(revision 228512)
+++ config/aarch64/aarch64-linux.h	(working copy)
@@ -88,4 +88,7 @@
 #undef TARGET_BINDS_LOCAL_P
 #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
 
+/* Define this to be nonzero if static stack checking is supported.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 #endif  /* GCC_AARCH64_LINUX_H */
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 228512)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -316,6 +316,7 @@ void aarch64_asm_output_labelref (FILE *
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 void aarch64_elf_asm_named_section (const char *, unsigned, tree);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
+const char * aarch64_output_probe_stack_range (rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 228512)
+++ config/aarch64/aarch64.c	(working copy)
@@ -76,6 +76,7 @@
 #include "sched-int.h"
 #include "cortex-a57-fma-steering.h"
 #include "target-globals.h"
+#include "common/common-target.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2144,6 +2145,167 @@ aarch64_libgcc_cmp_return_mode (void)
   return SImode;
 }
 
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+#if (PROBE_INTERVAL % 4096) != 0
+#error Cannot use indexed address calculation for stack probing
+#endif
+
+#if PROBE_INTERVAL > 4096
+#error Cannot use indexed addressing mode for stack probing
+#endif
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  */
+
+static void
+aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
+{
+  rtx reg9 = gen_rtx_REG (Pmode, 9);
+
+  /* The following code uses indexed address calculation on FIRST.  */
+  gcc_assert ((first % 4096) == 0);
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  */
+  if (size <= PROBE_INTERVAL)
+    {
+      emit_set_insn (reg9,
+		     plus_constant (Pmode, stack_pointer_rtx,
+					   -(first + PROBE_INTERVAL)));
+      emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - size));
+    }
+
+  /* The run-time loop is made up of 8 insns in the generic case while the
+     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
+  else if (size <= 4 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, rem;
+
+      emit_set_insn (reg9,
+		     plus_constant (Pmode, stack_pointer_rtx,
+					   -(first + PROBE_INTERVAL)));
+      emit_stack_probe (reg9);
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
+	 it exceeds SIZE.  If only two probes are needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
+	  emit_stack_probe (reg9);
+	}
+
+      rem = size - (i - PROBE_INTERVAL);
+      if (rem > 256)
+	{
+	  emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
+	  emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - rem));
+	}
+      else
+	emit_stack_probe (plus_constant (Pmode, reg9, -rem));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size;
+      rtx reg10 = gen_rtx_REG (Pmode, 10);
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_ADDR = SP + FIRST.  */
+      emit_set_insn (reg9,
+		     plus_constant (Pmode, stack_pointer_rtx, -first));
+
+      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
+      emit_set_insn (reg10,
+		     plus_constant (Pmode, stack_pointer_rtx,
+				    -(first + rounded_size)));
+
+
+      /* Step 3: the loop
+
+	 do
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+	 while (TEST_ADDR != LAST_ADDR)
+
+	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+	 until it is equal to ROUNDED_SIZE.  */
+
+      emit_insn (gen_probe_stack_range (reg9, reg9, reg10));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  HOST_WIDE_INT rem = size - rounded_size;
+
+	  if (rem > 256)
+	    {
+	      emit_set_insn (reg10,
+			     plus_constant (Pmode, reg10, -PROBE_INTERVAL));
+	      emit_stack_probe (plus_constant (Pmode, reg10,
+					       PROBE_INTERVAL - rem));
+	    }
+	  else
+	    emit_stack_probe (plus_constant (Pmode, reg10, -rem));
+	}
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
+   absolute addresses.  */
+
+const char *
+aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
+{
+  static int labelno = 0;
+  char loop_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+  /* Loop.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[0] = reg1;
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at TEST_ADDR.  */
+  output_asm_insn ("str\txzr, [%0]", xops);
+
+  /* Test if TEST_ADDR == LAST_ADDR.  */
+  xops[1] = reg2;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch.  */
+  fputs ("\tb.ne\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  return "";
+}
+
 static bool
 aarch64_frame_pointer_required (void)
 {
@@ -2544,6 +2706,18 @@ aarch64_expand_prologue (void)
   if (flag_stack_usage_info)
     current_function_static_stack_size = frame_size;
 
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      if (crtl->is_leaf && !cfun->calls_alloca)
+	{
+	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
+	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
+					    frame_size - STACK_CHECK_PROTECT);
+	}
+      else if (frame_size > 0)
+	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
+    }
+
   /* Store pairs and load pairs have a range only -512 to 504.  */
   if (offset >= 512)
     {
Index: config/aarch64/aarch64.md
===================================================================
--- config/aarch64/aarch64.md	(revision 228512)
+++ config/aarch64/aarch64.md	(working copy)
@@ -104,6 +104,7 @@ (define_c_enum "unspec" [
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
+    UNSPEC_PROBE_STACK_RANGE
     UNSPEC_RBIT
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
@@ -134,6 +135,7 @@ (define_c_enum "unspecv" [
     UNSPECV_SET_FPCR		; Represent assign of FPCR content.
     UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
+    UNSPECV_BLOCKAGE		; Represent a blockage
   ]
 )
 
@@ -4807,6 +4809,27 @@ (define_insn "stack_tie"
   [(set_attr "length" "0")]
 )
 
+;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
+;; all of memory.  This blocks insns from being moved across this point.
+
+(define_insn "blockage"
+  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
+  ""
+  ""
+  [(set_attr "length" "0")
+   (set_attr "type" "block")]
+)
+
+(define_insn "probe_stack_range"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")]
+			     UNSPEC_PROBE_STACK_RANGE))]
+  ""
+{
+  return aarch64_output_probe_stack_range (operands[0], operands[2]);
+})
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 228512)
+++ config/arm/arm.c	(working copy)
@@ -21262,11 +21262,12 @@ arm_emit_probe_stack_range (HOST_WIDE_IN
 
       /* Step 3: the loop
 
-	 while (TEST_ADDR != LAST_ADDR)
+	 do
 	   {
 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
 	     probe at TEST_ADDR
 	   }
+	 while (TEST_ADDR != LAST_ADDR)
 
 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
 	 until it is equal to ROUNDED_SIZE.  */
@@ -21311,22 +21312,22 @@ output_probe_stack_range (rtx reg1, rtx
 
   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
 
+  /* Loop.  */
   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
 
-   /* Test if TEST_ADDR == LAST_ADDR.  */
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
   xops[0] = reg1;
-  xops[1] = reg2;
-  output_asm_insn ("cmp\t%0, %1", xops);
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
 
-  if (TARGET_THUMB2)
-    fputs ("\tittt\tne\n", asm_out_file);
+  /* Probe at TEST_ADDR.  */
+  output_asm_insn ("str\tr0, [%0, #0]", xops);
 
-  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
-  xops[1] = GEN_INT (PROBE_INTERVAL);
-  output_asm_insn ("subne\t%0, %0, %1", xops);
+  /* Test if TEST_ADDR == LAST_ADDR.  */
+  xops[1] = reg2;
+  output_asm_insn ("cmp\t%0, %1", xops);
 
-  /* Probe at TEST_ADDR and branch.  */
-  output_asm_insn ("strne\tr0, [%0, #0]", xops);
+  /* Branch.  */
   fputs ("\tbne\t", asm_out_file);
   assemble_name_raw (asm_out_file, loop_lab);
   fputc ('\n', asm_out_file);
@@ -24869,7 +24870,7 @@ thumb1_expand_prologue (void)
 
   /* If we have a frame, then do stack checking.  FIXME: not implemented.  */
   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size)
-    sorry ("-fstack-check=specific for THUMB1");
+    sorry ("-fstack-check=specific for Thumb-1");
 
   amount = offsets->outgoing_args - offsets->saved_regs;
   amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
Index: config/arm/arm.md
===================================================================
--- config/arm/arm.md	(revision 228512)
+++ config/arm/arm.md	(working copy)
@@ -8262,9 +8262,7 @@ (define_insn "probe_stack"
   [(set (match_operand 0 "memory_operand" "=m")
         (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
   "TARGET_32BIT"
-{
-  return "str%?\\tr0, %0";
-}
+  "str%?\\tr0, %0"
   [(set_attr "type" "store1")
    (set_attr "predicable" "yes")]
 )

[-- Attachment #3: stack-checking.c --]
[-- Type: text/x-csrc, Size: 281 bytes --]

/* { dg-do run { target { *-*-linux* } } } */
/* { dg-options "-fstack-check" } */

int main(void)
{
  char *p;
  if (1)
    {
      char i[48];
      p = __builtin_alloca(8);
      p[0] = 1;
    }

  if (1)
    {
      char i[48], j[64];
      j[48] = 0;
    }

  return !p[0];
}

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-06 10:11       ` Eric Botcazou
@ 2015-10-06 13:43         ` Ramana Radhakrishnan
  2015-10-28 11:38           ` Eric Botcazou
  2015-10-07  8:15         ` Yao Qi
  2015-11-03 17:35         ` Richard Earnshaw
  2 siblings, 1 reply; 24+ messages in thread
From: Ramana Radhakrishnan @ 2015-10-06 13:43 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, James Greenhalgh, Marcus Shawcroft



On 06/10/15 11:11, Eric Botcazou wrote:
>> Thanks - I have no further comments on this patch. We probably need to
>> implement the same on AArch64 too in order to avoid similar problems.
> 
> Here's the implementation for aarch64, very similar but simpler since there is 
> no shortage of scratch registers; the only thing to note is the new blockage 
> pattern.  This was tested on real hardware but not with Linux, instead with 
> Darwin (experimental port of the toolchain to iOS) and makes it possible to 
> pass ACATS (Ada conformance testsuite which requires stack checking).
> 
> There is also a couple of tweaks for the ARM implementation: a cosmetic one 
> for the probe_stack pattern and one for the output_probe_stack_range loop.
> 
> 
> 2015-10-06  Tristan Gingold  <gingold@adacore.com>
>             Eric Botcazou  <ebotcazou@adacore.com>
> 
>         PR middle-end/65958
> 	* config/aarch64/aarch64-protos.h (aarch64_output_probe_stack-range):
> 	Declare.
> 	* config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
> 	UNSPEC_PROBE_STACK_RANGE.
> 	(blockage): New instruction.
> 	(probe_stack_range): Likewise.
> 	* config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
> 	function.
> 	(aarch64_output_probe_stack_range): Likewise.
> 	(aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
> 	static builtin stack checking is enabled.
> 	* config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
> 	Define.
> 
> 	* config/arm/arm.c (arm_emit_probe_stack_range): Adjust comment.
> 	(output_probe_stack_range): Rotate the loop and simplify.
> 	(thumb1_expand_prologue): Tweak sorry message.
> 	* config/arm/arm.md (probe_stack): Use bare string.
> 
> 
> 2015-10-06  Eric Botcazou  <ebotcazou@adacore.com>
> 
>         * gcc.target/aarch64/stack-checking.c: New test.
> 


Thanks - the arm backend changes are ok - but you need to wait for an AArch64 maintainer to review the AArch64 changes.

I've CC'd a couple of them on this.

Ramana

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-06 10:11       ` Eric Botcazou
  2015-10-06 13:43         ` Ramana Radhakrishnan
@ 2015-10-07  8:15         ` Yao Qi
  2015-10-07  9:10           ` Eric Botcazou
  2015-11-03 17:35         ` Richard Earnshaw
  2 siblings, 1 reply; 24+ messages in thread
From: Yao Qi @ 2015-10-07  8:15 UTC (permalink / raw)
  To: Eric Botcazou, Ramana Radhakrishnan; +Cc: gcc-patches

Hi Eric,

On 06/10/15 11:11, Eric Botcazou wrote:
> Here's the implementation for aarch64, very similar but simpler since there is
> no shortage of scratch registers; the only thing to note is the new blockage
> pattern.  This was tested on real hardware but not with Linux, instead with
> Darwin (experimental port of the toolchain to iOS) and makes it possible to
> pass ACATS (Ada conformance testsuite which requires stack checking).
>
> There is also a couple of tweaks for the ARM implementation: a cosmetic one
> for the probe_stack pattern and one for the output_probe_stack_range loop.

I assume that this patch (and arm patch) will change the instruction
sequences in prologue.  If so, do you have some examples about how
prologue is changed with this patch?  I need to adapt GDB prologue
analyser to these new instruction sequences.

-- 
Yao (齐尧)

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-07  8:15         ` Yao Qi
@ 2015-10-07  9:10           ` Eric Botcazou
  2015-10-07 10:42             ` Yao Qi
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Botcazou @ 2015-10-07  9:10 UTC (permalink / raw)
  To: Yao Qi; +Cc: gcc-patches, Ramana Radhakrishnan

> I assume that this patch (and arm patch) will change the instruction
> sequences in prologue.  If so, do you have some examples about how
> prologue is changed with this patch?  I need to adapt GDB prologue
> analyser to these new instruction sequences.

Yes, it will generate either individual probes or a probing loop before the 
frame is established when -fstack-check is passed.  For aarch64, a probe is a 
store based on x9 of the form:

	str xzr, [x9, #offset]

with preceding instructions to compute x9 from sp, typically:

 	sub x9, sp, #16384

A probing loop uses both x9 and x10:

	sub	x9, sp, #12288
	sub	x10, sp, #36864
LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	LPSRL0

with an optional last probe:

	str	xzr, [x10,#-16]

For arm, it's more convoluted because the base register may vary but, in 
simple cases (in particular if the function is not nested), it's IP:

	str r0, [ip, #offset]

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-07  9:10           ` Eric Botcazou
@ 2015-10-07 10:42             ` Yao Qi
  2015-10-07 17:38               ` Eric Botcazou
  0 siblings, 1 reply; 24+ messages in thread
From: Yao Qi @ 2015-10-07 10:42 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Ramana Radhakrishnan

Hi Eric,
Thanks for the examples.  I am able to map these instructions in your
examples back to RTX in your patch.

On 07/10/15 10:09, Eric Botcazou wrote:
> Yes, it will generate either individual probes or a probing loop before the
> frame is established when -fstack-check is passed.  For aarch64, a probe is a
> store based on x9 of the form:
>
> 	str xzr, [x9, #offset]
>
> with preceding instructions to compute x9 from sp, typically:
>
>   	sub x9, sp, #16384
>

If I read aarch64_emit_probe_stack_range correctly, these two
instructions are generated when (size <= PROBE_INTERVAL).  If
size <= 4 * PROBE_INTERVAL, more instructions are generated,

         sub x9, sp, #16384
         str xzr, [x9]

         sub x9, x9, #PROBE_INTERVAL
         str xzr, [x9]
	... /* At most two instances of these two insn. */

         either
         sub x9, x9, #PROBE_INTERVAL
         str xzr, [x9, #offset]
         or
         str xzr, [x9, -16]

> A probing loop uses both x9 and x10:
>
> 	sub	x9, sp, #12288
> 	sub	x10, sp, #36864
> LPSRL0:
> 	sub	x9, x9, 4096
> 	str	xzr, [x9]
> 	cmp	x9, x10
> 	b.ne	LPSRL0
>

The probing loop is used when size > 4 * PROBE_INTERVAL

> with an optional last probe:
>
> 	str	xzr, [x10,#-16]

and there can be an optional instruction before the probe,

	sub x10, x10, #PROBE_INTERVAL

Let me know if my understanding above is wrong.

When I read the examples and your patch, I happen to see a
nit in ChangeLog entry,

> 2015-10-06  Tristan Gingold  <gingold@adacore.com>
>             Eric Botcazou  <ebotcazou@adacore.com>
>
>         PR middle-end/65958
> 	* config/aarch64/aarch64-protos.h (aarch64_output_probe_stack-range):
> 	Declare.

s/aarch64_output_probe_stack-range/aarch64_output_probe_stack_range

-- 
Yao (齐尧)

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-07 10:42             ` Yao Qi
@ 2015-10-07 17:38               ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-10-07 17:38 UTC (permalink / raw)
  To: Yao Qi; +Cc: gcc-patches, Ramana Radhakrishnan

> If I read aarch64_emit_probe_stack_range correctly, these two
> instructions are generated when (size <= PROBE_INTERVAL).  If
> size <= 4 * PROBE_INTERVAL, more instructions are generated,
> 
>          sub x9, sp, #16384
>          str xzr, [x9]
> 
>          sub x9, x9, #PROBE_INTERVAL
>          str xzr, [x9]
> 	... /* At most two instances of these two insn. */
> 
>          either
>          sub x9, x9, #PROBE_INTERVAL
>          str xzr, [x9, #offset]
>          or
>          str xzr, [x9, -16]
> 
> > A probing loop uses both x9 and x10:
> > 	sub	x9, sp, #12288
> > 	sub	x10, sp, #36864
> > 
> > LPSRL0:
> > 	sub	x9, x9, 4096
> > 	str	xzr, [x9]
> > 	cmp	x9, x10
> > 	b.ne	LPSRL0
> 
> The probing loop is used when size > 4 * PROBE_INTERVAL
> 
> > with an optional last probe:
> > 	str	xzr, [x10,#-16]
> 
> and there can be an optional instruction before the probe,
> 
> 	sub x10, x10, #PROBE_INTERVAL
> 
> Let me know if my understanding above is wrong.

That's correct, probes can come with a 'sub' instruction just before.

> s/aarch64_output_probe_stack-range/aarch64_output_probe_stack_range

Thanks, will fix.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-06 13:43         ` Ramana Radhakrishnan
@ 2015-10-28 11:38           ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-10-28 11:38 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches, James Greenhalgh, Marcus Shawcroft

> Thanks - the arm backend changes are ok - but you need to wait for an
> AArch64 maintainer to review the AArch64 changes.

I installed the ARM changes long ago but the AArch64 ones are still pending.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-10-06 10:11       ` Eric Botcazou
  2015-10-06 13:43         ` Ramana Radhakrishnan
  2015-10-07  8:15         ` Yao Qi
@ 2015-11-03 17:35         ` Richard Earnshaw
  2015-11-03 18:05           ` Eric Botcazou
  2015-11-16 20:01           ` Eric Botcazou
  2 siblings, 2 replies; 24+ messages in thread
From: Richard Earnshaw @ 2015-11-03 17:35 UTC (permalink / raw)
  To: Eric Botcazou, Ramana Radhakrishnan; +Cc: gcc-patches

On 06/10/15 11:11, Eric Botcazou wrote:
>> Thanks - I have no further comments on this patch. We probably need to
>> implement the same on AArch64 too in order to avoid similar problems.
> 
> Here's the implementation for aarch64, very similar but simpler since there is 
> no shortage of scratch registers; the only thing to note is the new blockage 
> pattern.  This was tested on real hardware but not with Linux, instead with 
> Darwin (experimental port of the toolchain to iOS) and makes it possible to 
> pass ACATS (Ada conformance testsuite which requires stack checking).
> 
> There is also a couple of tweaks for the ARM implementation: a cosmetic one 
> for the probe_stack pattern and one for the output_probe_stack_range loop.
> 
> 
> 2015-10-06  Tristan Gingold  <gingold@adacore.com>
>             Eric Botcazou  <ebotcazou@adacore.com>
> 
>         PR middle-end/65958
> 	* config/aarch64/aarch64-protos.h (aarch64_output_probe_stack-range):
> 	Declare.
> 	* config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
> 	UNSPEC_PROBE_STACK_RANGE.
> 	(blockage): New instruction.
> 	(probe_stack_range): Likewise.
> 	* config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
> 	function.
> 	(aarch64_output_probe_stack_range): Likewise.
> 	(aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
> 	static builtin stack checking is enabled.
> 	* config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
> 	Define.
> 
> 	* config/arm/arm.c (arm_emit_probe_stack_range): Adjust comment.
> 	(output_probe_stack_range): Rotate the loop and simplify.
> 	(thumb1_expand_prologue): Tweak sorry message.
> 	* config/arm/arm.md (probe_stack): Use bare string.
> 
> 
> 2015-10-06  Eric Botcazou  <ebotcazou@adacore.com>
> 
>         * gcc.target/aarch64/stack-checking.c: New test.
> 

Unless there really is common code between the two patches, this should
be separated out into two posts, one for ARM and one for AArch64.

More comments inline.

> 
> pr65958-2.diff
> 
> 
> Index: config/aarch64/aarch64-linux.h
> ===================================================================
> --- config/aarch64/aarch64-linux.h	(revision 228512)
> +++ config/aarch64/aarch64-linux.h	(working copy)
> @@ -88,4 +88,7 @@
>  #undef TARGET_BINDS_LOCAL_P
>  #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
>  
> +/* Define this to be nonzero if static stack checking is supported.  */
> +#define STACK_CHECK_STATIC_BUILTIN 1
> +
>  #endif  /* GCC_AARCH64_LINUX_H */
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h	(revision 228512)
> +++ config/aarch64/aarch64-protos.h	(working copy)
> @@ -316,6 +316,7 @@ void aarch64_asm_output_labelref (FILE *
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  void aarch64_elf_asm_named_section (const char *, unsigned, tree);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
> +const char * aarch64_output_probe_stack_range (rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode, const char *);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx);
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c	(revision 228512)
> +++ config/aarch64/aarch64.c	(working copy)
> @@ -76,6 +76,7 @@
>  #include "sched-int.h"
>  #include "cortex-a57-fma-steering.h"
>  #include "target-globals.h"
> +#include "common/common-target.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2144,6 +2145,167 @@ aarch64_libgcc_cmp_return_mode (void)
>    return SImode;
>  }
>  
> +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
> +
> +#if (PROBE_INTERVAL % 4096) != 0
> +#error Cannot use indexed address calculation for stack probing
> +#endif
> +
> +#if PROBE_INTERVAL > 4096
> +#error Cannot use indexed addressing mode for stack probing
> +#endif
> +

Hmm, so if PROBE_INTERVAL != 4096 we barf!

While that's safe and probably right for Linux, on some OSes there might
be a minimum page size of 16k or even 64k.  It would be nice if we could
support that.

> +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
> +   inclusive.  These are offsets from the current stack pointer.  */
> +
> +static void
> +aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
> +{
> +  rtx reg9 = gen_rtx_REG (Pmode, 9);

Ug!  Manifest constants should be moved to pre-defines.
PROBE_STACK_BASE_REG?

> +
> +  /* The following code uses indexed address calculation on FIRST.  */
> +  gcc_assert ((first % 4096) == 0);

where's 4096 come from?

> +
> +  /* See if we have a constant small number of probes to generate.  If so,
> +     that's the easy case.  */
> +  if (size <= PROBE_INTERVAL)
> +    {
> +      emit_set_insn (reg9,
> +		     plus_constant (Pmode, stack_pointer_rtx,
> +					   -(first + PROBE_INTERVAL)));
> +      emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - size));
> +    }
> +
> +  /* The run-time loop is made up of 8 insns in the generic case while the
> +     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
> +  else if (size <= 4 * PROBE_INTERVAL)
> +    {
> +      HOST_WIDE_INT i, rem;
> +
> +      emit_set_insn (reg9,
> +		     plus_constant (Pmode, stack_pointer_rtx,
> +					   -(first + PROBE_INTERVAL)));
> +      emit_stack_probe (reg9);
> +
> +      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
> +	 it exceeds SIZE.  If only two probes are needed, this will not
> +	 generate any code.  Then probe at FIRST + SIZE.  */
> +      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
> +	{
> +	  emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> +	  emit_stack_probe (reg9);
> +	}
> +
> +      rem = size - (i - PROBE_INTERVAL);
> +      if (rem > 256)
> +	{
> +	  emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> +	  emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - rem));
> +	}
> +      else
> +	emit_stack_probe (plus_constant (Pmode, reg9, -rem));
> +    }
> +
> +  /* Otherwise, do the same as above, but in a loop.  Note that we must be
> +     extra careful with variables wrapping around because we might be at
> +     the very top (or the very bottom) of the address space and we have
> +     to be able to handle this case properly; in particular, we use an
> +     equality test for the loop condition.  */
> +  else
> +    {
> +      HOST_WIDE_INT rounded_size;
> +      rtx reg10 = gen_rtx_REG (Pmode, 10);

More manifest constants.

> +
> +      /* Step 1: round SIZE to the previous multiple of the interval.  */
> +
> +      rounded_size = size & -PROBE_INTERVAL;
> +
> +
> +      /* Step 2: compute initial and final value of the loop counter.  */
> +
> +      /* TEST_ADDR = SP + FIRST.  */
> +      emit_set_insn (reg9,
> +		     plus_constant (Pmode, stack_pointer_rtx, -first));
> +
> +      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
> +      emit_set_insn (reg10,
> +		     plus_constant (Pmode, stack_pointer_rtx,
> +				    -(first + rounded_size)));
> +
> +
> +      /* Step 3: the loop
> +
> +	 do
> +	   {
> +	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
> +	     probe at TEST_ADDR
> +	   }
> +	 while (TEST_ADDR != LAST_ADDR)
> +
> +	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
> +	 until it is equal to ROUNDED_SIZE.  */
> +
> +      emit_insn (gen_probe_stack_range (reg9, reg9, reg10));
> +
> +
> +      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
> +	 that SIZE is equal to ROUNDED_SIZE.  */
> +
> +      if (size != rounded_size)
> +	{
> +	  HOST_WIDE_INT rem = size - rounded_size;
> +
> +	  if (rem > 256)
> +	    {
> +	      emit_set_insn (reg10,
> +			     plus_constant (Pmode, reg10, -PROBE_INTERVAL));
> +	      emit_stack_probe (plus_constant (Pmode, reg10,
> +					       PROBE_INTERVAL - rem));
> +	    }
> +	  else
> +	    emit_stack_probe (plus_constant (Pmode, reg10, -rem));
> +	}
> +    }
> +
> +  /* Make sure nothing is scheduled before we are done.  */
> +  emit_insn (gen_blockage ());
> +}
> +
> +/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
> +   absolute addresses.  */
> +
> +const char *
> +aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> +{
> +  static int labelno = 0;
> +  char loop_lab[32];
> +  rtx xops[2];
> +
> +  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
> +
> +  /* Loop.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
> +
> +  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
> +  xops[0] = reg1;
> +  xops[1] = GEN_INT (PROBE_INTERVAL);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Probe at TEST_ADDR.  */
> +  output_asm_insn ("str\txzr, [%0]", xops);
> +
> +  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  xops[1] = reg2;
> +  output_asm_insn ("cmp\t%0, %1", xops);
> +
> +  /* Branch.  */
> +  fputs ("\tb.ne\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  return "";
> +}
> +
>  static bool
>  aarch64_frame_pointer_required (void)
>  {
> @@ -2544,6 +2706,18 @@ aarch64_expand_prologue (void)
>    if (flag_stack_usage_info)
>      current_function_static_stack_size = frame_size;
>  
> +  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
> +    {
> +      if (crtl->is_leaf && !cfun->calls_alloca)
> +	{
> +	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
> +	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
> +					    frame_size - STACK_CHECK_PROTECT);
> +	}
> +      else if (frame_size > 0)
> +	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
> +    }
> +
>    /* Store pairs and load pairs have a range only -512 to 504.  */
>    if (offset >= 512)
>      {
> Index: config/aarch64/aarch64.md
> ===================================================================
> --- config/aarch64/aarch64.md	(revision 228512)
> +++ config/aarch64/aarch64.md	(working copy)
> @@ -104,6 +104,7 @@ (define_c_enum "unspec" [
>      UNSPEC_MB
>      UNSPEC_NOP
>      UNSPEC_PRLG_STK
> +    UNSPEC_PROBE_STACK_RANGE
>      UNSPEC_RBIT
>      UNSPEC_SISD_NEG
>      UNSPEC_SISD_SSHL
> @@ -134,6 +135,7 @@ (define_c_enum "unspecv" [
>      UNSPECV_SET_FPCR		; Represent assign of FPCR content.
>      UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
>      UNSPECV_SET_FPSR		; Represent assign of FPSR content.
> +    UNSPECV_BLOCKAGE		; Represent a blockage
>    ]
>  )
>  
> @@ -4807,6 +4809,27 @@ (define_insn "stack_tie"
>    [(set_attr "length" "0")]
>  )
>  
> +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
> +;; all of memory.  This blocks insns from being moved across this point.
> +
> +(define_insn "blockage"
> +  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
> +  ""
> +  ""
> +  [(set_attr "length" "0")
> +   (set_attr "type" "block")]
> +)
> +
> +(define_insn "probe_stack_range"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +			     (match_operand:DI 2 "register_operand" "r")]
> +			     UNSPEC_PROBE_STACK_RANGE))]
> +  ""
> +{
> +  return aarch64_output_probe_stack_range (operands[0], operands[2]);
> +})
> +

This should be annotated with the sequence length.

>  ;; Named pattern for expanding thread pointer reference.
>  (define_expand "get_thread_pointerdi"
>    [(match_operand:DI 0 "register_operand" "=r")]
> Index: config/arm/arm.c
> ===================================================================
> --- config/arm/arm.c	(revision 228512)
> +++ config/arm/arm.c	(working copy)
> @@ -21262,11 +21262,12 @@ arm_emit_probe_stack_range (HOST_WIDE_IN
>  
>        /* Step 3: the loop
>  
> -	 while (TEST_ADDR != LAST_ADDR)
> +	 do
>  	   {
>  	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
>  	     probe at TEST_ADDR
>  	   }
> +	 while (TEST_ADDR != LAST_ADDR)
>  
>  	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
>  	 until it is equal to ROUNDED_SIZE.  */
> @@ -21311,22 +21312,22 @@ output_probe_stack_range (rtx reg1, rtx
>  
>    ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
>  
> +  /* Loop.  */
>    ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
>  
> -   /* Test if TEST_ADDR == LAST_ADDR.  */
> +  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
>    xops[0] = reg1;
> -  xops[1] = reg2;
> -  output_asm_insn ("cmp\t%0, %1", xops);
> +  xops[1] = GEN_INT (PROBE_INTERVAL);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
>  
> -  if (TARGET_THUMB2)
> -    fputs ("\tittt\tne\n", asm_out_file);
> +  /* Probe at TEST_ADDR.  */
> +  output_asm_insn ("str\tr0, [%0, #0]", xops);
>  
> -  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
> -  xops[1] = GEN_INT (PROBE_INTERVAL);
> -  output_asm_insn ("subne\t%0, %0, %1", xops);
> +  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  xops[1] = reg2;
> +  output_asm_insn ("cmp\t%0, %1", xops);
>  
> -  /* Probe at TEST_ADDR and branch.  */
> -  output_asm_insn ("strne\tr0, [%0, #0]", xops);
> +  /* Branch.  */
>    fputs ("\tbne\t", asm_out_file);
>    assemble_name_raw (asm_out_file, loop_lab);
>    fputc ('\n', asm_out_file);
> @@ -24869,7 +24870,7 @@ thumb1_expand_prologue (void)
>  
>    /* If we have a frame, then do stack checking.  FIXME: not implemented.  */
>    if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size)
> -    sorry ("-fstack-check=specific for THUMB1");
> +    sorry ("-fstack-check=specific for Thumb-1");
>  
>    amount = offsets->outgoing_args - offsets->saved_regs;
>    amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
> Index: config/arm/arm.md
> ===================================================================
> --- config/arm/arm.md	(revision 228512)
> +++ config/arm/arm.md	(working copy)
> @@ -8262,9 +8262,7 @@ (define_insn "probe_stack"
>    [(set (match_operand 0 "memory_operand" "=m")
>          (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
>    "TARGET_32BIT"
> -{
> -  return "str%?\\tr0, %0";
> -}
> +  "str%?\\tr0, %0"
>    [(set_attr "type" "store1")
>     (set_attr "predicable" "yes")]
>  )
> 
> 
> stack-checking.c
> 
> 
> /* { dg-do run { target { *-*-linux* } } } */
> /* { dg-options "-fstack-check" } */
> 
> int main(void)
> {
>   char *p;
>   if (1)
>     {
>       char i[48];
>       p = __builtin_alloca(8);
>       p[0] = 1;
>     }
> 
>   if (1)
>     {
>       char i[48], j[64];
>       j[48] = 0;
>     }
> 
>   return !p[0];
> }
> 

R.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-11-03 17:35         ` Richard Earnshaw
@ 2015-11-03 18:05           ` Eric Botcazou
  2015-11-03 21:51             ` Eric Botcazou
  2015-11-16 20:01           ` Eric Botcazou
  1 sibling, 1 reply; 24+ messages in thread
From: Eric Botcazou @ 2015-11-03 18:05 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

> Unless there really is common code between the two patches, this should
> be separated out into two posts, one for ARM and one for AArch64.

The ARM bits were approved by Ramana and installed right away.

> Hmm, so if PROBE_INTERVAL != 4096 we barf!

Yes, but that's not usual, ARM and SPARC have it too, 4096 happens to be the 
limit of reg+off addressing mode on several architectures.

> While that's safe and probably right for Linux, on some OSes there might
> be a minimum page size of 16k or even 64k.  It would be nice if we could
> support that.

OK, but we cannot test anything at the moment.

> Ug!  Manifest constants should be moved to pre-defines.
> PROBE_STACK_BASE_REG?

OK.

> > +
> > +  /* The following code uses indexed address calculation on FIRST.  */
> > +  gcc_assert ((first % 4096) == 0);
> 
> where's 4096 come from?

It's the same constraint as above:

#if (PROBE_INTERVAL % 4096) != 0
#error Cannot use indexed address calculation for stack probing
#endif

to be able to use the 12-bit shifted immediate instructions. 

> More manifest constants.

Yeah, consistency first. ;-)

> This should be annotated with the sequence length.

OK, thanks for the review, I'll adjust.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-11-03 18:05           ` Eric Botcazou
@ 2015-11-03 21:51             ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-11-03 21:51 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

> Yes, but that's not usual, ARM and SPARC have it too, 4096 happens to be the
> limit of reg+off addressing mode on several architectures.

... not unusual...

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-11-03 17:35         ` Richard Earnshaw
  2015-11-03 18:05           ` Eric Botcazou
@ 2015-11-16 20:01           ` Eric Botcazou
  2015-11-25  7:56             ` Eric Botcazou
  2015-12-03 11:08             ` Richard Earnshaw
  1 sibling, 2 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-11-16 20:01 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 1296 bytes --]

> More comments inline.

Revised version attached, which addresses all your comments and in particular 
removes the

+#if PROBE_INTERVAL > 4096
+#error Cannot use indexed addressing mode for stack probing
+#endif

compile-time assertion.  It generates the same code for PROBE_INTERVAL == 4096 
as before and it generates code that can be assembled for 8192.

Tested on Aarch64/Linux, OK for the mainline?


2015-11-16  Tristan Gingold  <gingold@adacore.com>
            Eric Botcazou  <ebotcazou@adacore.com>

        PR middle-end/65958
        * config/aarch64/aarch64-protos.h (aarch64_output_probe_stack_range):
        Declare.
        * config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
        UNSPEC_PROBE_STACK_RANGE.
        (blockage): New instruction.
        (probe_stack_range): Likewise.
        * config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
        function.
        (aarch64_output_probe_stack_range): Likewise.
        (aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
        static builtin stack checking is enabled.
        * config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
        Define.


2015-11-16  Eric Botcazou  <ebotcazou@adacore.com>

        * gcc.target/aarch64/stack-checking.c: New test.

-- 
Eric Botcazou

[-- Attachment #2: stack-checking.c --]
[-- Type: text/x-csrc, Size: 281 bytes --]

/* { dg-do run { target { *-*-linux* } } } */
/* { dg-options "-fstack-check" } */

int main(void)
{
  char *p;
  if (1)
    {
      char i[48];
      p = __builtin_alloca(8);
      p[0] = 1;
    }

  if (1)
    {
      char i[48], j[64];
      j[32] = 0;
    }

  return !p[0];
}

[-- Attachment #3: pr65958-3.diff --]
[-- Type: text/x-patch, Size: 8930 bytes --]

Index: config/aarch64/aarch64-linux.h
===================================================================
--- config/aarch64/aarch64-linux.h	(revision 230397)
+++ config/aarch64/aarch64-linux.h	(working copy)
@@ -88,4 +88,7 @@
 #undef TARGET_BINDS_LOCAL_P
 #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
 
+/* Define this to be nonzero if static stack checking is supported.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 #endif  /* GCC_AARCH64_LINUX_H */
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 230397)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -340,6 +340,7 @@ void aarch64_asm_output_labelref (FILE *
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 void aarch64_elf_asm_named_section (const char *, unsigned, tree);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
+const char * aarch64_output_probe_stack_range (rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 230397)
+++ config/aarch64/aarch64.c	(working copy)
@@ -62,6 +62,7 @@
 #include "sched-int.h"
 #include "cortex-a57-fma-steering.h"
 #include "target-globals.h"
+#include "common/common-target.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2151,6 +2152,169 @@ aarch64_libgcc_cmp_return_mode (void)
   return SImode;
 }
 
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+/* We use the 12-bit shifted immediate arithmetic instructions so values
+   must be a multiple of (1 << 12), i.e. 4096.  */
+#if (PROBE_INTERVAL % 4096) != 0
+#error Cannot use simple address calculation for stack probing
+#endif
+
+/* The pair of scratch registers used for stack probing.  */
+#define PROBE_STACK_FIRST_REG  9
+#define PROBE_STACK_SECOND_REG 10
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  */
+
+static void
+aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
+{
+  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
+
+  /* See the same assertion on PROBE_INTERVAL above.  */
+  gcc_assert ((first % 4096) == 0);
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  */
+  if (size <= PROBE_INTERVAL)
+    {
+      const HOST_WIDE_INT base = ROUND_UP (size, 4096);
+      emit_set_insn (reg1,
+		     plus_constant (Pmode, stack_pointer_rtx,
+					   -(first + base)));
+      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
+    }
+
+  /* The run-time loop is made up of 8 insns in the generic case while the
+     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
+  else if (size <= 4 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, rem;
+
+      emit_set_insn (reg1,
+		     plus_constant (Pmode, stack_pointer_rtx,
+					   -(first + PROBE_INTERVAL)));
+      emit_stack_probe (reg1);
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
+	 it exceeds SIZE.  If only two probes are needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -PROBE_INTERVAL));
+	  emit_stack_probe (reg1);
+	}
+
+      rem = size - (i - PROBE_INTERVAL);
+      if (rem > 256)
+	{
+	  const HOST_WIDE_INT base = ROUND_UP (rem, 4096);
+	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
+	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
+	}
+      else
+	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_ADDR = SP + FIRST.  */
+      emit_set_insn (reg1,
+		     plus_constant (Pmode, stack_pointer_rtx, -first));
+
+      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
+      emit_set_insn (reg2,
+		     plus_constant (Pmode, stack_pointer_rtx,
+				    -(first + rounded_size)));
+
+
+      /* Step 3: the loop
+
+	 do
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+	 while (TEST_ADDR != LAST_ADDR)
+
+	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+	 until it is equal to ROUNDED_SIZE.  */
+
+      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  HOST_WIDE_INT rem = size - rounded_size;
+
+	  if (rem > 256)
+	    {
+	      const HOST_WIDE_INT base = ROUND_UP (rem, 4096);
+	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
+	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
+	    }
+	  else
+	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
+	}
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
+   absolute addresses.  */
+
+const char *
+aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
+{
+  static int labelno = 0;
+  char loop_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+  /* Loop.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[0] = reg1;
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at TEST_ADDR.  */
+  output_asm_insn ("str\txzr, [%0]", xops);
+
+  /* Test if TEST_ADDR == LAST_ADDR.  */
+  xops[1] = reg2;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch.  */
+  fputs ("\tb.ne\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  return "";
+}
+
 static bool
 aarch64_frame_pointer_required (void)
 {
@@ -2551,6 +2715,18 @@ aarch64_expand_prologue (void)
   if (flag_stack_usage_info)
     current_function_static_stack_size = frame_size;
 
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      if (crtl->is_leaf && !cfun->calls_alloca)
+	{
+	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
+	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
+					    frame_size - STACK_CHECK_PROTECT);
+	}
+      else if (frame_size > 0)
+	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
+    }
+
   /* Store pairs and load pairs have a range only -512 to 504.  */
   if (offset >= 512)
     {
Index: config/aarch64/aarch64.md
===================================================================
--- config/aarch64/aarch64.md	(revision 230397)
+++ config/aarch64/aarch64.md	(working copy)
@@ -104,6 +104,7 @@ (define_c_enum "unspec" [
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
+    UNSPEC_PROBE_STACK_RANGE
     UNSPEC_RBIT
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
@@ -137,6 +138,7 @@ (define_c_enum "unspecv" [
     UNSPECV_SET_FPCR		; Represent assign of FPCR content.
     UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
+    UNSPECV_BLOCKAGE		; Represent a blockage
   ]
 )
 
@@ -4851,6 +4853,29 @@ (define_insn "stack_tie"
   [(set_attr "length" "0")]
 )
 
+;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
+;; all of memory.  This blocks insns from being moved across this point.
+
+(define_insn "blockage"
+  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
+  ""
+  ""
+  [(set_attr "length" "0")
+   (set_attr "type" "block")]
+)
+
+(define_insn "probe_stack_range"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")]
+			     UNSPEC_PROBE_STACK_RANGE))]
+  ""
+{
+  return aarch64_output_probe_stack_range (operands[0], operands[2]);
+}
+  [(set_attr "length" "32")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-11-16 20:01           ` Eric Botcazou
@ 2015-11-25  7:56             ` Eric Botcazou
  2015-12-03 11:08             ` Richard Earnshaw
  1 sibling, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-11-25  7:56 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

Richard,

> Revised version attached, which addresses all your comments and in
> particular removes the
> 
> +#if PROBE_INTERVAL > 4096
> +#error Cannot use indexed addressing mode for stack probing
> +#endif
> 
> compile-time assertion.  

Any objection to me applying this revised version?  I'd like to close the PR.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-11-16 20:01           ` Eric Botcazou
  2015-11-25  7:56             ` Eric Botcazou
@ 2015-12-03 11:08             ` Richard Earnshaw
  2015-12-03 12:20               ` Eric Botcazou
  1 sibling, 1 reply; 24+ messages in thread
From: Richard Earnshaw @ 2015-12-03 11:08 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Ramana Radhakrishnan

Sorry for the delay, very busy on other things these days...

On 16/11/15 20:00, Eric Botcazou wrote:
>> More comments inline.
>
> Revised version attached, which addresses all your comments and in
particular
> removes the
>
> +#if PROBE_INTERVAL > 4096
> +#error Cannot use indexed addressing mode for stack probing
> +#endif
>
> compile-time assertion.  It generates the same code for PROBE_INTERVAL
== 4096
> as before and it generates code that can be assembled for 8192.
>
> Tested on Aarch64/Linux, OK for the mainline?
>

> +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
> +
> +/* We use the 12-bit shifted immediate arithmetic instructions so values
> +   must be a multiple of (1 << 12), i.e. 4096.  */
> +#if (PROBE_INTERVAL % 4096) != 0

I can understand this restriction, but...

> +  /* See the same assertion on PROBE_INTERVAL above.  */
> +  gcc_assert ((first % 4096) == 0);

... why isn't this a test that FIRST is aligned to PROBE_INTERVAL?

> +  /* See if we have a constant small number of probes to generate.
If so,
> +     that's the easy case.  */
> +  if (size <= PROBE_INTERVAL)
> +    {
> +      const HOST_WIDE_INT base = ROUND_UP (size, 4096);
> +      emit_set_insn (reg1,

blank line between declarations and code. Also, can we come up with a
suitable define for 4096 here that expresses the context and then use
that consistently through the remainder of this function?

> +(define_insn "probe_stack_range"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +			     (match_operand:DI 2 "register_operand" "r")]
> +			     UNSPEC_PROBE_STACK_RANGE))]

I think this should really use PTRmode, so that it's ILP32 ready (I'm
not going to ask you to make sure that works though, since I suspect
there are still other issues to resolve with ILP32 at this time).

R.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-12-03 11:08             ` Richard Earnshaw
@ 2015-12-03 12:20               ` Eric Botcazou
  2015-12-04  9:39                 ` Marcus Shawcroft
  2015-12-04 13:49                 ` Richard Earnshaw
  0 siblings, 2 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-12-03 12:20 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 1823 bytes --]

> I can understand this restriction, but...
> 
> > +  /* See the same assertion on PROBE_INTERVAL above.  */
> > +  gcc_assert ((first % 4096) == 0);
> 
> ... why isn't this a test that FIRST is aligned to PROBE_INTERVAL?

Because that isn't guaranteed, FIRST is related to the size of the protection 
area while PROBE_INTERVAL is related to the page size.

> blank line between declarations and code. Also, can we come up with a
> suitable define for 4096 here that expresses the context and then use
> that consistently through the remainder of this function?

OK, let's use ARITH_BASE.

> > +(define_insn "probe_stack_range"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> > +			     (match_operand:DI 2 "register_operand" "r")]
> > +			     UNSPEC_PROBE_STACK_RANGE))]
> 
> I think this should really use PTRmode, so that it's ILP32 ready (I'm
> not going to ask you to make sure that works though, since I suspect
> there are still other issues to resolve with ILP32 at this time).

Done.  Manually tested for now, I'll fully test it if approved.


        PR middle-end/65958
        * config/aarch64/aarch64-protos.h (aarch64_output_probe_stack_range):
        Declare.
        * config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
        UNSPEC_PROBE_STACK_RANGE.
        (blockage): New instruction.
        (probe_stack_range_<PTR:mode>): Likewise.
        * config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
        function.
        (aarch64_output_probe_stack_range): Likewise.
        (aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
        static builtin stack checking is enabled.
        * config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
        Define.

-- 
Eric Botcazou

[-- Attachment #2: pr65958-2c.diff --]
[-- Type: text/x-patch, Size: 9167 bytes --]

Index: config/aarch64/aarch64-linux.h
===================================================================
--- config/aarch64/aarch64-linux.h	(revision 231206)
+++ config/aarch64/aarch64-linux.h	(working copy)
@@ -88,4 +88,7 @@
 #undef TARGET_BINDS_LOCAL_P
 #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
 
+/* Define this to be nonzero if static stack checking is supported.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 #endif  /* GCC_AARCH64_LINUX_H */
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 231206)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -340,6 +340,7 @@ void aarch64_asm_output_labelref (FILE *
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 void aarch64_elf_asm_named_section (const char *, unsigned, tree);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
+const char * aarch64_output_probe_stack_range (rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 231206)
+++ config/aarch64/aarch64.c	(working copy)
@@ -62,6 +62,7 @@
 #include "sched-int.h"
 #include "cortex-a57-fma-steering.h"
 #include "target-globals.h"
+#include "common/common-target.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2183,6 +2184,179 @@ aarch64_libgcc_cmp_return_mode (void)
   return SImode;
 }
 
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+/* We use the 12-bit shifted immediate arithmetic instructions so values
+   must be multiple of (1 << 12), i.e. 4096.  */
+#define ARITH_BASE 4096
+
+#if (PROBE_INTERVAL % ARITH_BASE) != 0
+#error Cannot use simple address calculation for stack probing
+#endif
+
+/* The pair of scratch registers used for stack probing.  */
+#define PROBE_STACK_FIRST_REG  9
+#define PROBE_STACK_SECOND_REG 10
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  */
+
+static void
+aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
+{
+  rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
+
+  /* See the same assertion on PROBE_INTERVAL above.  */
+  gcc_assert ((first % ARITH_BASE) == 0);
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  */
+  if (size <= PROBE_INTERVAL)
+    {
+      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_BASE);
+
+      emit_set_insn (reg1,
+		     plus_constant (ptr_mode,
+				    stack_pointer_rtx, -(first + base)));
+      emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
+    }
+
+  /* The run-time loop is made up of 8 insns in the generic case while the
+     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
+  else if (size <= 4 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, rem;
+
+      emit_set_insn (reg1,
+		     plus_constant (ptr_mode,
+				    stack_pointer_rtx,
+				    -(first + PROBE_INTERVAL)));
+      emit_stack_probe (reg1);
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
+	 it exceeds SIZE.  If only two probes are needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  emit_set_insn (reg1,
+			 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
+	  emit_stack_probe (reg1);
+	}
+
+      rem = size - (i - PROBE_INTERVAL);
+      if (rem > 256)
+	{
+	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_BASE);
+
+	  emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
+	  emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
+	}
+      else
+	emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_ADDR = SP + FIRST.  */
+      emit_set_insn (reg1,
+		     plus_constant (ptr_mode, stack_pointer_rtx, -first));
+
+      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
+      emit_set_insn (reg2,
+		     plus_constant (ptr_mode, stack_pointer_rtx,
+				    -(first + rounded_size)));
+
+
+      /* Step 3: the loop
+
+	 do
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+	 while (TEST_ADDR != LAST_ADDR)
+
+	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+	 until it is equal to ROUNDED_SIZE.  */
+
+      if (ptr_mode == DImode)
+	emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
+      else
+	emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  HOST_WIDE_INT rem = size - rounded_size;
+
+	  if (rem > 256)
+	    {
+	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_BASE);
+
+	      emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
+	      emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
+	    }
+	  else
+	    emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
+	}
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
+   absolute addresses.  */
+
+const char *
+aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
+{
+  static int labelno = 0;
+  char loop_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+  /* Loop.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[0] = reg1;
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at TEST_ADDR.  */
+  output_asm_insn ("str\txzr, [%0]", xops);
+
+  /* Test if TEST_ADDR == LAST_ADDR.  */
+  xops[1] = reg2;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch.  */
+  fputs ("\tb.ne\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  return "";
+}
+
 static bool
 aarch64_frame_pointer_required (void)
 {
@@ -2583,6 +2757,18 @@ aarch64_expand_prologue (void)
   if (flag_stack_usage_info)
     current_function_static_stack_size = frame_size;
 
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      if (crtl->is_leaf && !cfun->calls_alloca)
+	{
+	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
+	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
+					    frame_size - STACK_CHECK_PROTECT);
+	}
+      else if (frame_size > 0)
+	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
+    }
+
   /* Store pairs and load pairs have a range only -512 to 504.  */
   if (offset >= 512)
     {
Index: config/aarch64/aarch64.md
===================================================================
--- config/aarch64/aarch64.md	(revision 231206)
+++ config/aarch64/aarch64.md	(working copy)
@@ -104,6 +104,7 @@ (define_c_enum "unspec" [
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
+    UNSPEC_PROBE_STACK_RANGE
     UNSPEC_RBIT
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
@@ -137,6 +138,7 @@ (define_c_enum "unspecv" [
     UNSPECV_SET_FPCR		; Represent assign of FPCR content.
     UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
+    UNSPECV_BLOCKAGE		; Represent a blockage
   ]
 )
 
@@ -4951,6 +4953,29 @@ (define_insn "stack_tie"
   [(set_attr "length" "0")]
 )
 
+;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
+;; all of memory.  This blocks insns from being moved across this point.
+
+(define_insn "blockage"
+  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
+  ""
+  ""
+  [(set_attr "length" "0")
+   (set_attr "type" "block")]
+)
+
+(define_insn "probe_stack_range_<PTR:mode>"
+  [(set (match_operand:PTR 0 "register_operand" "=r")
+	(unspec_volatile:PTR [(match_operand:PTR 1 "register_operand" "0")
+			      (match_operand:PTR 2 "register_operand" "r")]
+			       UNSPEC_PROBE_STACK_RANGE))]
+  ""
+{
+  return aarch64_output_probe_stack_range (operands[0], operands[2]);
+}
+  [(set_attr "length" "32")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-12-03 12:20               ` Eric Botcazou
@ 2015-12-04  9:39                 ` Marcus Shawcroft
  2015-12-04 11:58                   ` Eric Botcazou
  2015-12-04 13:49                 ` Richard Earnshaw
  1 sibling, 1 reply; 24+ messages in thread
From: Marcus Shawcroft @ 2015-12-04  9:39 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: Richard Earnshaw, gcc-patches, Ramana Radhakrishnan

On 3 December 2015 at 12:17, Eric Botcazou <ebotcazou@adacore.com> wrote:
>> I can understand this restriction, but...
>>
>> > +  /* See the same assertion on PROBE_INTERVAL above.  */
>> > +  gcc_assert ((first % 4096) == 0);
>>
>> ... why isn't this a test that FIRST is aligned to PROBE_INTERVAL?
>
> Because that isn't guaranteed, FIRST is related to the size of the protection
> area while PROBE_INTERVAL is related to the page size.
>
>> blank line between declarations and code. Also, can we come up with a
>> suitable define for 4096 here that expresses the context and then use
>> that consistently through the remainder of this function?
>
> OK, let's use ARITH_BASE.
>
>> > +(define_insn "probe_stack_range"
>> > +  [(set (match_operand:DI 0 "register_operand" "=r")
>> > +   (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
>> > +                        (match_operand:DI 2 "register_operand" "r")]
>> > +                        UNSPEC_PROBE_STACK_RANGE))]
>>
>> I think this should really use PTRmode, so that it's ILP32 ready (I'm
>> not going to ask you to make sure that works though, since I suspect
>> there are still other issues to resolve with ILP32 at this time).
>
> Done.  Manually tested for now, I'll fully test it if approved.

Looks ok to me.  OK /Marcus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-12-04  9:39                 ` Marcus Shawcroft
@ 2015-12-04 11:58                   ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-12-04 11:58 UTC (permalink / raw)
  To: Marcus Shawcroft; +Cc: gcc-patches, Richard Earnshaw, Ramana Radhakrishnan

> Looks ok to me.  OK /Marcus

Thanks.  Testing was successful so I have installed it with a small change 
(s/ARITH_BASE/ARITH_FACTOR/, it's a bit more mathematically correct).

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-12-03 12:20               ` Eric Botcazou
  2015-12-04  9:39                 ` Marcus Shawcroft
@ 2015-12-04 13:49                 ` Richard Earnshaw
  2015-12-04 18:26                   ` Eric Botcazou
  1 sibling, 1 reply; 24+ messages in thread
From: Richard Earnshaw @ 2015-12-04 13:49 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Ramana Radhakrishnan

> +	(unspec_volatile:PTR [(match_operand:PTR 1 "register_operand" "0")
> +			      (match_operand:PTR 2 "register_operand" "r")]
> +			       UNSPEC_PROBE_STACK_RANGE))]

Minor nit.  Since this is used in an unspec_volatile, the name should be
UNSPECV_ and defined in the unspecv enum.

Otherwise OK once testing is complete.

R.


On 03/12/15 12:17, Eric Botcazou wrote:
>> I can understand this restriction, but...
>>
>>> +  /* See the same assertion on PROBE_INTERVAL above.  */
>>> +  gcc_assert ((first % 4096) == 0);
>>
>> ... why isn't this a test that FIRST is aligned to PROBE_INTERVAL?
> 
> Because that isn't guaranteed, FIRST is related to the size of the protection 
> area while PROBE_INTERVAL is related to the page size.
> 
>> blank line between declarations and code. Also, can we come up with a
>> suitable define for 4096 here that expresses the context and then use
>> that consistently through the remainder of this function?
> 
> OK, let's use ARITH_BASE.
> 
>>> +(define_insn "probe_stack_range"
>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>> +	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
>>> +			     (match_operand:DI 2 "register_operand" "r")]
>>> +			     UNSPEC_PROBE_STACK_RANGE))]
>>
>> I think this should really use PTRmode, so that it's ILP32 ready (I'm
>> not going to ask you to make sure that works though, since I suspect
>> there are still other issues to resolve with ILP32 at this time).
> 
> Done.  Manually tested for now, I'll fully test it if approved.
> 
> 
>         PR middle-end/65958
>         * config/aarch64/aarch64-protos.h (aarch64_output_probe_stack_range):
>         Declare.
>         * config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
>         UNSPEC_PROBE_STACK_RANGE.
>         (blockage): New instruction.
>         (probe_stack_range_<PTR:mode>): Likewise.
>         * config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
>         function.
>         (aarch64_output_probe_stack_range): Likewise.
>         (aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
>         static builtin stack checking is enabled.
>         * config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
>         Define.
> 
> 
> pr65958-2c.diff
> 
> 
> Index: config/aarch64/aarch64-linux.h
> ===================================================================
> --- config/aarch64/aarch64-linux.h	(revision 231206)
> +++ config/aarch64/aarch64-linux.h	(working copy)
> @@ -88,4 +88,7 @@
>  #undef TARGET_BINDS_LOCAL_P
>  #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
>  
> +/* Define this to be nonzero if static stack checking is supported.  */
> +#define STACK_CHECK_STATIC_BUILTIN 1
> +
>  #endif  /* GCC_AARCH64_LINUX_H */
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h	(revision 231206)
> +++ config/aarch64/aarch64-protos.h	(working copy)
> @@ -340,6 +340,7 @@ void aarch64_asm_output_labelref (FILE *
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  void aarch64_elf_asm_named_section (const char *, unsigned, tree);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
> +const char * aarch64_output_probe_stack_range (rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode, const char *);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx);
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c	(revision 231206)
> +++ config/aarch64/aarch64.c	(working copy)
> @@ -62,6 +62,7 @@
>  #include "sched-int.h"
>  #include "cortex-a57-fma-steering.h"
>  #include "target-globals.h"
> +#include "common/common-target.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2183,6 +2184,179 @@ aarch64_libgcc_cmp_return_mode (void)
>    return SImode;
>  }
>  
> +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
> +
> +/* We use the 12-bit shifted immediate arithmetic instructions so values
> +   must be multiple of (1 << 12), i.e. 4096.  */
> +#define ARITH_BASE 4096
> +
> +#if (PROBE_INTERVAL % ARITH_BASE) != 0
> +#error Cannot use simple address calculation for stack probing
> +#endif
> +
> +/* The pair of scratch registers used for stack probing.  */
> +#define PROBE_STACK_FIRST_REG  9
> +#define PROBE_STACK_SECOND_REG 10
> +
> +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
> +   inclusive.  These are offsets from the current stack pointer.  */
> +
> +static void
> +aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
> +{
> +  rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
> +
> +  /* See the same assertion on PROBE_INTERVAL above.  */
> +  gcc_assert ((first % ARITH_BASE) == 0);
> +
> +  /* See if we have a constant small number of probes to generate.  If so,
> +     that's the easy case.  */
> +  if (size <= PROBE_INTERVAL)
> +    {
> +      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_BASE);
> +
> +      emit_set_insn (reg1,
> +		     plus_constant (ptr_mode,
> +				    stack_pointer_rtx, -(first + base)));
> +      emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
> +    }
> +
> +  /* The run-time loop is made up of 8 insns in the generic case while the
> +     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
> +  else if (size <= 4 * PROBE_INTERVAL)
> +    {
> +      HOST_WIDE_INT i, rem;
> +
> +      emit_set_insn (reg1,
> +		     plus_constant (ptr_mode,
> +				    stack_pointer_rtx,
> +				    -(first + PROBE_INTERVAL)));
> +      emit_stack_probe (reg1);
> +
> +      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
> +	 it exceeds SIZE.  If only two probes are needed, this will not
> +	 generate any code.  Then probe at FIRST + SIZE.  */
> +      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
> +	{
> +	  emit_set_insn (reg1,
> +			 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
> +	  emit_stack_probe (reg1);
> +	}
> +
> +      rem = size - (i - PROBE_INTERVAL);
> +      if (rem > 256)
> +	{
> +	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_BASE);
> +
> +	  emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
> +	  emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
> +	}
> +      else
> +	emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
> +    }
> +
> +  /* Otherwise, do the same as above, but in a loop.  Note that we must be
> +     extra careful with variables wrapping around because we might be at
> +     the very top (or the very bottom) of the address space and we have
> +     to be able to handle this case properly; in particular, we use an
> +     equality test for the loop condition.  */
> +  else
> +    {
> +      rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
> +
> +      /* Step 1: round SIZE to the previous multiple of the interval.  */
> +
> +      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
> +
> +
> +      /* Step 2: compute initial and final value of the loop counter.  */
> +
> +      /* TEST_ADDR = SP + FIRST.  */
> +      emit_set_insn (reg1,
> +		     plus_constant (ptr_mode, stack_pointer_rtx, -first));
> +
> +      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
> +      emit_set_insn (reg2,
> +		     plus_constant (ptr_mode, stack_pointer_rtx,
> +				    -(first + rounded_size)));
> +
> +
> +      /* Step 3: the loop
> +
> +	 do
> +	   {
> +	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
> +	     probe at TEST_ADDR
> +	   }
> +	 while (TEST_ADDR != LAST_ADDR)
> +
> +	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
> +	 until it is equal to ROUNDED_SIZE.  */
> +
> +      if (ptr_mode == DImode)
> +	emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
> +      else
> +	emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
> +
> +
> +      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
> +	 that SIZE is equal to ROUNDED_SIZE.  */
> +
> +      if (size != rounded_size)
> +	{
> +	  HOST_WIDE_INT rem = size - rounded_size;
> +
> +	  if (rem > 256)
> +	    {
> +	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_BASE);
> +
> +	      emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
> +	      emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
> +	    }
> +	  else
> +	    emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
> +	}
> +    }
> +
> +  /* Make sure nothing is scheduled before we are done.  */
> +  emit_insn (gen_blockage ());
> +}
> +
> +/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
> +   absolute addresses.  */
> +
> +const char *
> +aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> +{
> +  static int labelno = 0;
> +  char loop_lab[32];
> +  rtx xops[2];
> +
> +  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
> +
> +  /* Loop.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
> +
> +  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
> +  xops[0] = reg1;
> +  xops[1] = GEN_INT (PROBE_INTERVAL);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Probe at TEST_ADDR.  */
> +  output_asm_insn ("str\txzr, [%0]", xops);
> +
> +  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  xops[1] = reg2;
> +  output_asm_insn ("cmp\t%0, %1", xops);
> +
> +  /* Branch.  */
> +  fputs ("\tb.ne\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  return "";
> +}
> +
>  static bool
>  aarch64_frame_pointer_required (void)
>  {
> @@ -2583,6 +2757,18 @@ aarch64_expand_prologue (void)
>    if (flag_stack_usage_info)
>      current_function_static_stack_size = frame_size;
>  
> +  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
> +    {
> +      if (crtl->is_leaf && !cfun->calls_alloca)
> +	{
> +	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
> +	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
> +					    frame_size - STACK_CHECK_PROTECT);
> +	}
> +      else if (frame_size > 0)
> +	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
> +    }
> +
>    /* Store pairs and load pairs have a range only -512 to 504.  */
>    if (offset >= 512)
>      {
> Index: config/aarch64/aarch64.md
> ===================================================================
> --- config/aarch64/aarch64.md	(revision 231206)
> +++ config/aarch64/aarch64.md	(working copy)
> @@ -104,6 +104,7 @@ (define_c_enum "unspec" [
>      UNSPEC_MB
>      UNSPEC_NOP
>      UNSPEC_PRLG_STK
> +    UNSPEC_PROBE_STACK_RANGE
>      UNSPEC_RBIT
>      UNSPEC_SISD_NEG
>      UNSPEC_SISD_SSHL
> @@ -137,6 +138,7 @@ (define_c_enum "unspecv" [
>      UNSPECV_SET_FPCR		; Represent assign of FPCR content.
>      UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
>      UNSPECV_SET_FPSR		; Represent assign of FPSR content.
> +    UNSPECV_BLOCKAGE		; Represent a blockage
>    ]
>  )
>  
> @@ -4951,6 +4953,29 @@ (define_insn "stack_tie"
>    [(set_attr "length" "0")]
>  )
>  
> +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
> +;; all of memory.  This blocks insns from being moved across this point.
> +
> +(define_insn "blockage"
> +  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
> +  ""
> +  ""
> +  [(set_attr "length" "0")
> +   (set_attr "type" "block")]
> +)
> +
> +(define_insn "probe_stack_range_<PTR:mode>"
> +  [(set (match_operand:PTR 0 "register_operand" "=r")
> +	(unspec_volatile:PTR [(match_operand:PTR 1 "register_operand" "0")
> +			      (match_operand:PTR 2 "register_operand" "r")]
> +			       UNSPEC_PROBE_STACK_RANGE))]
> +  ""
> +{
> +  return aarch64_output_probe_stack_range (operands[0], operands[2]);
> +}
> +  [(set_attr "length" "32")]
> +)
> +
>  ;; Named pattern for expanding thread pointer reference.
>  (define_expand "get_thread_pointerdi"
>    [(match_operand:DI 0 "register_operand" "=r")]
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [ARM] Fix PR middle-end/65958
  2015-12-04 13:49                 ` Richard Earnshaw
@ 2015-12-04 18:26                   ` Eric Botcazou
  0 siblings, 0 replies; 24+ messages in thread
From: Eric Botcazou @ 2015-12-04 18:26 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 626 bytes --]

> Minor nit.  Since this is used in an unspec_volatile, the name should be
> UNSPECV_ and defined in the unspecv enum.

Ouch.  It turns out that the ARM implementation has it too so I have installed 
the attached patchlet on top of the others after minimal retesting.


	PR middle-end/65958
	* config/arm/unspecs.md (unspec): Remove UNSPEC_PROBE_STACK_RANGE.
	(unspecv): Add VUNSPEC_PROBE_STACK_RANGE.
	* config/arm/arm.md (probe_stack_range): Adjust.
	* config/aarch64/aarch64.md (unspec): Remove UNSPEC_PROBE_STACK_RANGE.
	(unspecv): Add UNSPECV_PROBE_STACK_RANGE.
	(probe_stack_range_<PTR:mode>): Adjust.

-- 
Eric Botcazou

[-- Attachment #2: pr65958-3.diff --]
[-- Type: text/x-patch, Size: 2445 bytes --]

Index: config/arm/arm.md
===================================================================
--- config/arm/arm.md	(revision 231250)
+++ config/arm/arm.md	(working copy)
@@ -8183,7 +8183,7 @@ (define_insn "probe_stack_range"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec_volatile:SI [(match_operand:SI 1 "register_operand" "0")
 			     (match_operand:SI 2 "register_operand" "r")]
-			     UNSPEC_PROBE_STACK_RANGE))]
+			     VUNSPEC_PROBE_STACK_RANGE))]
   "TARGET_32BIT"
 {
   return output_probe_stack_range (operands[0], operands[2]);
Index: config/arm/unspecs.md
===================================================================
--- config/arm/unspecs.md	(revision 231250)
+++ config/arm/unspecs.md	(working copy)
@@ -84,7 +84,6 @@ (define_c_enum "unspec" [
   UNSPEC_VRINTA         ; Represent a float to integral float rounding
                         ; towards nearest, ties away from zero.
   UNSPEC_PROBE_STACK    ; Probe stack memory reference
-  UNSPEC_PROBE_STACK_RANGE ; Probe stack range
 ])
 
 (define_c_enum "unspec" [
@@ -147,6 +146,7 @@ (define_c_enum "unspecv" [
   VUNSPEC_STL		; Represent a store-register-release.
   VUNSPEC_GET_FPSCR	; Represent fetch of FPSCR content.
   VUNSPEC_SET_FPSCR	; Represent assign of FPSCR content.
+  VUNSPEC_PROBE_STACK_RANGE ; Represent stack range probing.
 ])
 
 ;; Enumerators for NEON unspecs.
Index: config/aarch64/aarch64.md
===================================================================
--- config/aarch64/aarch64.md	(revision 231259)
+++ config/aarch64/aarch64.md	(working copy)
@@ -104,7 +104,6 @@ (define_c_enum "unspec" [
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
-    UNSPEC_PROBE_STACK_RANGE
     UNSPEC_RBIT
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
@@ -139,6 +138,7 @@ (define_c_enum "unspecv" [
     UNSPECV_GET_FPSR		; Represent fetch of FPSR content.
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
+    UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
   ]
 )
 
@@ -4968,7 +4968,7 @@ (define_insn "probe_stack_range_<PTR:mod
   [(set (match_operand:PTR 0 "register_operand" "=r")
 	(unspec_volatile:PTR [(match_operand:PTR 1 "register_operand" "0")
 			      (match_operand:PTR 2 "register_operand" "r")]
-			       UNSPEC_PROBE_STACK_RANGE))]
+			       UNSPECV_PROBE_STACK_RANGE))]
   ""
 {
   return aarch64_output_probe_stack_range (operands[0], operands[2]);

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2015-12-04 18:26 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-11 16:15 [ARM] Fix PR middle-end/65958 Eric Botcazou
2015-06-17 10:41 ` Ramana Radhakrishnan
2015-06-18 19:03   ` Eric Botcazou
2015-07-06 15:46     ` Ramana Radhakrishnan
2015-09-20 21:05       ` Christophe Lyon
2015-09-21  8:18         ` Eric Botcazou
2015-10-06 10:11       ` Eric Botcazou
2015-10-06 13:43         ` Ramana Radhakrishnan
2015-10-28 11:38           ` Eric Botcazou
2015-10-07  8:15         ` Yao Qi
2015-10-07  9:10           ` Eric Botcazou
2015-10-07 10:42             ` Yao Qi
2015-10-07 17:38               ` Eric Botcazou
2015-11-03 17:35         ` Richard Earnshaw
2015-11-03 18:05           ` Eric Botcazou
2015-11-03 21:51             ` Eric Botcazou
2015-11-16 20:01           ` Eric Botcazou
2015-11-25  7:56             ` Eric Botcazou
2015-12-03 11:08             ` Richard Earnshaw
2015-12-03 12:20               ` Eric Botcazou
2015-12-04  9:39                 ` Marcus Shawcroft
2015-12-04 11:58                   ` Eric Botcazou
2015-12-04 13:49                 ` Richard Earnshaw
2015-12-04 18:26                   ` Eric Botcazou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).