public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/ARM/heads/CVE-2023-4039/gcc-10)] aarch64: Explicitly record probe registers in frame info
@ 2023-09-12 15:25 Richard Sandiford
  0 siblings, 0 replies; only message in thread
From: Richard Sandiford @ 2023-09-12 15:25 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:3c73e7bc86a918c2c24b3ad96b2d44716a5e2614

commit 3c73e7bc86a918c2c24b3ad96b2d44716a5e2614
Author: Richard Sandiford <richard.sandiford@arm.com>
Date:   Tue Jun 27 11:50:34 2023 +0100

    aarch64: Explicitly record probe registers in frame info
    
    The stack frame is currently divided into three areas:
    
    A: the area above the hard frame pointer
    B: the SVE saves below the hard frame pointer
    C: the outgoing arguments
    
    If the stack frame is allocated in one chunk, the allocation needs a
    probe if the frame size is >= guard_size - 1KiB.  In addition, if the
    function is not a leaf function, it must probe an address no more than
    1KiB above the outgoing SP.  We ensured the second condition by
    
    (1) using single-chunk allocations for non-leaf functions only if
        the link register save slot is within 512 bytes of the bottom
        of the frame; and
    
    (2) using the link register save as a probe (meaning, for instance,
        that it can't be individually shrink wrapped)
    
    If instead the stack is allocated in multiple chunks, then:
    
    * an allocation involving only the outgoing arguments (C above) requires
      a probe if the allocation size is > 1KiB
    
    * any other allocation requires a probe if the allocation size
      is >= guard_size - 1KiB
    
    * second and subsequent allocations require the previous allocation
      to probe at the bottom of the allocated area, regardless of the size
      of that previous allocation
    
    The final point means that, unlike for single allocations,
    it can be necessary to have both a non-SVE register probe and
    an SVE register probe.  For example:
    
    * allocate A, probe using a non-SVE register save
    * allocate B, probe using an SVE register save
    * allocate C
    
    The non-SVE register used in this case was again the link register.
    It was previously used even if the link register save slot was some
    bytes above the bottom of the non-SVE register saves, but an earlier
    patch avoided that by putting the link register save slot first.
    
    As a belt-and-braces fix, this patch explicitly records which
    probe registers we're using and allows the non-SVE probe to be
    whichever register comes first (as for SVE).
    
    The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
    
    gcc/
            * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
            (aarch64_frame::hard_fp_save_and_probe): New fields.
            * config/aarch64/aarch64.c (aarch64_layout_frame): Initialize them.
            Rather than asserting that a leaf function saves LR, instead assert
            that a leaf function saves something.
            (aarch64_get_separate_components): Prevent the chosen probe
            registers from being individually shrink-wrapped.
            (aarch64_allocate_and_probe_stack_space): Remove workaround for
            probe registers that aren't at the bottom of the previous allocation.
    
    gcc/testsuite/
            * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.

Diff:
---
 gcc/config/aarch64/aarch64.c                       | 67 +++++++++++++++++-----
 gcc/config/aarch64/aarch64.h                       |  8 +++
 .../gcc.target/aarch64/sve/pcs/stack_clash_3.c     |  6 +-
 3 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 37c6219b07ab..2dea9b5617d3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6897,13 +6897,10 @@ aarch64_layout_frame (void)
 	&& !crtl->abi->clobbers_full_reg_p (regno))
       frame.reg_offset[regno] = SLOT_REQUIRED;
 
-  /* With stack-clash, LR must be saved in non-leaf functions.  */
-  gcc_assert (crtl->is_leaf
-	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
-
   poly_int64 offset = crtl->outgoing_args_size;
   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
   frame.bytes_below_saved_regs = offset;
+  frame.sve_save_and_probe = INVALID_REGNUM;
 
   /* Now assign stack slots for the registers.  Start with the predicate
      registers, since predicate LDR and STR have a relatively small
@@ -6911,6 +6908,8 @@ aarch64_layout_frame (void)
   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
       {
+	if (frame.sve_save_and_probe == INVALID_REGNUM)
+	  frame.sve_save_and_probe = regno;
 	frame.reg_offset[regno] = offset;
 	offset += BYTES_PER_SVE_PRED;
       }
@@ -6948,6 +6947,8 @@ aarch64_layout_frame (void)
     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
 	{
+	  if (frame.sve_save_and_probe == INVALID_REGNUM)
+	    frame.sve_save_and_probe = regno;
 	  frame.reg_offset[regno] = offset;
 	  offset += vector_save_size;
 	}
@@ -6957,11 +6958,19 @@ aarch64_layout_frame (void)
   frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
   bool saves_below_hard_fp_p
     = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
+  gcc_assert (!saves_below_hard_fp_p
+	      || (frame.sve_save_and_probe != INVALID_REGNUM
+		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
+			       frame.bytes_below_saved_regs)));
+
   frame.bytes_below_hard_fp = offset;
+  frame.hard_fp_save_and_probe = INVALID_REGNUM;
 
 #define ALLOCATE_GPR_SLOT(REGNO)				\
   do								\
     {								\
+      if (frame.hard_fp_save_and_probe == INVALID_REGNUM)	\
+	frame.hard_fp_save_and_probe = (REGNO);			\
       frame.reg_offset[REGNO] = offset;				\
       if (frame.wb_candidate1 == INVALID_REGNUM)		\
 	frame.wb_candidate1 = (REGNO);				\
@@ -6998,6 +7007,8 @@ aarch64_layout_frame (void)
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
       {
+	if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
+	  frame.hard_fp_save_and_probe = regno;
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
 	if (regno == last_fp_reg
@@ -7021,6 +7032,17 @@ aarch64_layout_frame (void)
   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
 
   frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
+  gcc_assert (known_eq (frame.saved_regs_size,
+			frame.below_hard_fp_saved_regs_size)
+	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
+		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
+			       frame.bytes_below_hard_fp)));
+
+  /* With stack-clash, a register must be saved in non-leaf functions.
+     The saving of the bottommost register counts as an implicit probe,
+     which allows us to maintain the invariant described in the comment
+     at expand_prologue.  */
+  gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
 
   offset += get_frame_size ();
   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -7120,6 +7142,25 @@ aarch64_layout_frame (void)
       frame.final_adjust = frame.bytes_below_saved_regs;
     }
 
+  /* The frame is allocated in pieces, with each non-final piece
+     including a register save at offset 0 that acts as a probe for
+     the following piece.  In addition, the save of the bottommost register
+     acts as a probe for callees and allocas.  Roll back any probes that
+     aren't needed.
+
+     A probe isn't needed if it is associated with the final allocation
+     (including callees and allocas) that happens before the epilogue is
+     executed.  */
+  if (crtl->is_leaf
+      && !cfun->calls_alloca
+      && known_eq (frame.final_adjust, 0))
+    {
+      if (maybe_ne (frame.sve_callee_adjust, 0))
+	frame.sve_save_and_probe = INVALID_REGNUM;
+      else
+	frame.hard_fp_save_and_probe = INVALID_REGNUM;
+    }
+
   /* Make sure the individual adjustments add up to the full frame size.  */
   gcc_assert (known_eq (frame.initial_adjust
 			+ frame.callee_adjust
@@ -7669,13 +7710,6 @@ aarch64_get_separate_components (void)
 
 	poly_int64 offset = frame.reg_offset[regno];
 
-	/* If the register is saved in the first SVE save slot, we use
-	   it as a stack probe for -fstack-clash-protection.  */
-	if (flag_stack_clash_protection
-	    && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
-	    && known_eq (offset, frame.bytes_below_saved_regs))
-	  continue;
-
 	/* Get the offset relative to the register we'll use.  */
 	if (frame_pointer_needed)
 	  offset -= frame.bytes_below_hard_fp;
@@ -7710,6 +7744,13 @@ aarch64_get_separate_components (void)
 
   bitmap_clear_bit (components, LR_REGNUM);
   bitmap_clear_bit (components, SP_REGNUM);
+  if (flag_stack_clash_protection)
+    {
+      if (frame.sve_save_and_probe != INVALID_REGNUM)
+	bitmap_clear_bit (components, frame.sve_save_and_probe);
+      if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
+	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
+    }
 
   return components;
 }
@@ -8246,8 +8287,8 @@ aarch64_epilogue_uses (int regno)
    When probing is needed, we emit a probe at the start of the prologue
    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
 
-   We have to track how much space has been allocated and the only stores
-   to the stack we track as implicit probes are the FP/LR stores.
+   We can also use register saves as probes.  These are stored in
+   sve_save_and_probe and hard_fp_save_and_probe.
 
    For outgoing arguments we probe if the size is larger than 1KB, such that
    the ABI specified buffer is maintained for the next callee.
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 1c3c62639cbf..5279503fd090 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -877,6 +877,14 @@ struct GTY (()) aarch64_frame
      This is the register they should use.  */
   unsigned spare_pred_reg;
 
+  /* An SVE register that is saved below the hard frame pointer and that acts
+     as a probe for later allocations, or INVALID_REGNUM if none.  */
+  unsigned sve_save_and_probe;
+
+  /* A register that is saved at the hard frame pointer and that acts
+     as a probe for later allocations, or INVALID_REGNUM if none.  */
+  unsigned hard_fp_save_and_probe;
+
   bool laid_out;
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
index 3e01ec36c3a4..3530a0d504ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
@@ -11,11 +11,10 @@
 **	mov	x11, sp
 **	...
 **	sub	sp, sp, x13
-**	str	p4, \[sp\]
 **	cbz	w0, [^\n]*
+**	str	p4, \[sp\]
 **	...
 **	ptrue	p0\.b, all
-**	ldr	p4, \[sp\]
 **	addvl	sp, sp, #1
 **	ldr	x24, \[sp\], 32
 **	ret
@@ -39,13 +38,12 @@ test_1 (int n)
 **	mov	x11, sp
 **	...
 **	sub	sp, sp, x13
-**	str	p4, \[sp\]
 **	cbz	w0, [^\n]*
+**	str	p4, \[sp\]
 **	str	p5, \[sp, #1, mul vl\]
 **	str	p6, \[sp, #2, mul vl\]
 **	...
 **	ptrue	p0\.b, all
-**	ldr	p4, \[sp\]
 **	addvl	sp, sp, #1
 **	ldr	x24, \[sp\], 32
 **	ret

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-09-12 15:25 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-12 15:25 [gcc(refs/vendors/ARM/heads/CVE-2023-4039/gcc-10)] aarch64: Explicitly record probe registers in frame info Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).