Hi Richard,

Here's the updated patch and some comments inline below.

An example sequence is:

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSRL0:
        cmp     x16, 65536
        b.lt    .BRRCHK0
        sub     sp, sp, 65536
        str     xzr, [sp, 1024]
        sub     x16, x16, 65536
        b       .SVLPSRL0
.BRRCHK0:
        sub     sp, sp, x16
        cmp     x16, 2048
        b.lt    .BERCHK0
        str     xzr, [sp, 1024]
.BERCHK0:
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
        stp     x29, x30, [sp]

Ok for trunk?

Thanks,
Tamar

gcc/
2018-09-07  Tamar Christina

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE-specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-09-07  Tamar Christina

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-16.c: New test.
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.

The 08/28/2018 21:40, Richard Sandiford wrote:
> I'll leave the AArch64 maintainers to review, but some comments.
>
> Tamar Christina writes:
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> >    return "";
> >  }
> > +
> > +  /* Test if BASE < LIMIT.  */
> > +  xops[1] = limit;
> > +  output_asm_insn ("cmp\t%0, %1", xops);
>
> Think this should be ADJUSTMENT < LIMIT.

Actually it should be 2KB in this case.  I've explained why in the updated patch.

> > +  /* Branch to end.  */
> > +  fputs ("\tb.lt\t", asm_out_file);
> > +  assemble_name_raw (asm_out_file, loop_end_lab);
> > +  fputc ('\n', asm_out_file);
> > +
> > +  /* Probe at BASE + LIMIT.  */
> > +  output_asm_insn ("str\txzr, [%0, %1]", xops);
>
> It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT,
> which could clobber the caller's frame.

Yeah, the comparison should have been a bit larger.  Thanks.
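To make that corner case easier to reason about, here is a rough C model of the
control flow in the sequence at the top of this mail.  It is purely
illustrative (the function and variable names are made up), with the 64KB guard
size, the 1KB STACK_CLASH_CALLER_GUARD offset and the 2KB residual threshold
filled in:

  /* Illustrative model only, not code from the patch: ADJUSTMENT plays the
     role of x16 above, and SP just models the address arithmetic.  */
  void
  sve_probe_model (char *sp, unsigned long adjustment)
  {
    const unsigned long guard_size = 65536;    /* 64KB guard pages.  */
    const unsigned long caller_guard = 1024;   /* STACK_CLASH_CALLER_GUARD.  */

    /* .SVLPSRL0: while at least one full guard page remains, allocate it
       and probe 1KB into the newly allocated page.  */
    while (adjustment >= guard_size)
      {
        sp -= guard_size;
        sp[caller_guard] = 0;                  /* str xzr, [sp, 1024]  */
        adjustment -= guard_size;
      }

    /* .BRRCHK0: allocate the residual and probe it only when it is at least
       the 2KB threshold, so the store below always lands inside the new
       allocation rather than in the caller's frame.  */
    sp -= adjustment;
    if (adjustment >= 2048)
      sp[caller_guard] = 0;                    /* str xzr, [sp, 1024]  */
  }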
> > +
> > +  /* No probe leave.  */
> > +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> > +  return "";
>
> With the CFA stuff and constant load, I think this works out as:
>
> ---------------------------------------------
> # 12 insns
>         mov     r15, base
>         mov     adjustment, N
> 1:
>         cmp     adjustment, guard_size
>         b.lt    2f
>         sub     base, base, guard_size
>         str     xzr, [base, limit]
>         sub     adjustment, adjustment, guard_size
>         b       1b
> 2:
>         sub     base, base, adjustment
>         cmp     adjustment, limit
>         b.le    3f
>         str     xzr, [base, limit]
> 3:
> ---------------------------------------------
>
> What do you think about something like:
>
> ---------------------------------------------
> # 10 insns
>         mov     adjustment, N
>         sub     r15, base, adjustment
>         subs    adjustment, adjustment, min_probe_threshold
>         b.lo    2f
> 1:
>         add     base, x15, adjustment
>         str     xzr, [base, 0]
>         subs    adjustment, adjustment, 16
>         and     adjustment, adjustment, ~(guard_size-1)
>         b.hs    1b
> 2:
>         mov     base, r15
> ---------------------------------------------
>
> or (with different trade-offs):
>
> ---------------------------------------------
> # 11 insns
>         mov     adjustment, N
>         sub     r15, base, adjustment
>         subs    adjustment, adjustment, min_probe_threshold
>         b.lo    2f
>         # Might be 0, leading to a double probe
>         and     r14, adjustment, guard_size-1
> 1:
>         add     base, x15, adjustment
>         str     xzr, [base, 0]
>         subs    adjustment, adjustment, r14
>         mov     r14, guard_size
>         b.hs    1b
> 2:
>         mov     base, r15
> ---------------------------------------------
>
> or (longer, but with a simpler loop):
>
> ---------------------------------------------
> # 12 insns
>         mov     adjustment, N
>         sub     r15, base, adjustment
>         subs    adjustment, adjustment, min_probe_threshold
>         b.lo    2f
>         str     xzr, [base, -16]!
>         sub     adjustment, adjustment, 32
>         and     adjustment, adjustment, -(guard_size-1)
> 1:
>         add     base, x15, adjustment
>         str     xzr, [base, 0]
>         subs    adjustment, adjustment, guard_size
>         b.hs    1b
> 2:
>         mov     base, r15
> ---------------------------------------------
>
> with the CFA based on r15+offset?
>
> These loops probe more often than necessary in some cases,
> but they only need a single branch in the common case that
> ADJUSTMENT <= MIN_PROBE_THRESHOLD.

I haven't changed the loop yet because I'm a bit on the fence about whether the
implementation difficulties would outweigh the benefits.  We are planning on
doing something smarter for SVE, so optimizing these loops now, only to replace
them later, may not be time well spent.

The problem is that to support both 4KB and 64KB pages, instructions such as
subs would require different immediates and shifts.  Granted, we technically
only support these two, so I could hardcode the values, but that would mean
these functions are less general than the rest.

If you think it would be worthwhile, I'd be happy to use one of these loops
instead.

> > @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >        }
> >      }
> >
> > -  HOST_WIDE_INT size;
> > +  /* GCC's initialization analysis is broken so initialize size.  */
> > +  HOST_WIDE_INT size = 0;
>
> It's not broken in this case. :-)  is_constant only modifies its argument
> when returning true.  In other cases the variable keeps whatever value
> it had originally.  And the code does use "size" when !is_constant,
> so an explicit initialisation is necessary.

Ah, ok.  Thanks!
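Just to restate that for my own benefit, a minimal sketch of the behaviour
(not the real code):

  /* is_constant writes to its argument only when it returns true.  */
  HOST_WIDE_INT size = 0;
  if (poly_size.is_constant (&size))
    {
      /* SIZE now holds the compile-time-constant allocation size.  */
    }
  else
    {
      /* is_constant did not touch SIZE, so it still has the value it was
         initialised with, and the code below still reads it on this path,
         hence the explicit "= 0".  */
    }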
> >    /* If SIZE is not large enough to require probing, just adjust the stack and
> >       exit.  */
> > -  if (!poly_size.is_constant (&size)
> > -      || known_lt (poly_size, min_probe_threshold)
> > +  if ((poly_size.is_constant (&size)
> > +       && known_lt (poly_size, min_probe_threshold))
> >        || !flag_stack_clash_protection)
>
> No need for the is_constant here, just known_lt is enough.

The is_constant is used to extract the size value safely.

> >      {
> >        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> >        return;
> >      }
> >
> > -  if (dump_file)
> > +  if (dump_file && poly_size.is_constant ())
> >      fprintf (dump_file,
> >              "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> >              ", probing will be required.\n", size);
> >
> > +  if (dump_file && !poly_size.is_constant ())
> > +    {
> > +      fprintf (dump_file, "Stack clash SVE prologue: ");
> > +      dump_dec (MSG_NOTE, poly_size);
>
> This should be print_dec (poly_size, dump_file);

done.

> > +      fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > +    }
> > +
> >    /* Round size to the nearest multiple of guard_size, and calculate the
> >       residual as the difference between the original size and the rounded
> >       size.  */
> > @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >
> >    /* We can handle a small number of allocations/probes inline.  Otherwise
> >       punt to a loop.  */
> > -  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> > +  if (poly_size.is_constant ()
> > +      && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> >      {
> >        for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
> >          {
> > ...
> >          }
> >        dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
> >      }
> > -  else
> > +  else if (poly_size.is_constant ())
> >      {
> >        /* Compute the ending address.  */
> >        aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
> > @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> >        emit_insn (gen_blockage ());
> >        dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
> >      }
> > +  else
> > +    {
>
> It would probably be better to handle "!poly_size.is_constant ()"
> after the "!flag_stack_clash_protection" if statement and exit early,
> so that we don't do calculations based on "size" when "size" has a
> fairly meaningless value.  It would also avoid repeated checks for
> is_constant.

done

> > +      rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
> > +      rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);
>
> CONST_INTs don't have a recorded mode, so this should either be GEN_INT or
> (better) gen_int_mode.

done.

> Thanks,
> Richard

--
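P.S. To make Richard's last point concrete, the two constants above are now
created with gen_int_mode rather than gen_rtx_CONST_INT, along these lines
(a sketch of the change, not the exact committed hunk):

  /* gen_int_mode canonicalizes the value for the requested mode; a CONST_INT
     built directly has no mode recorded at all.  */
  rtx probe_const = gen_int_mode (STACK_CLASH_CALLER_GUARD, Pmode);
  rtx guard_const = gen_int_mode (guard_size, Pmode);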