* [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant [not found] ` <b653851c-9587-a498-a8da-8d235b4ddbcc@foss.arm.com> @ 2016-07-20 13:03 ` Jiong Wang 2016-07-20 14:19 ` Richard Earnshaw (lists) 0 siblings, 1 reply; 9+ messages in thread From: Jiong Wang @ 2016-07-20 13:03 UTC (permalink / raw) To: GCC Patches [-- Attachment #1: Type: text/plain, Size: 1013 bytes --] The stack adjustment sequences inside aarch64_expand_prologue/epilogue are doing exactly what aarch64_add_constant offers, except that they also need to be aware of dwarf generation. This patch teaches the existing aarch64_add_constant about dwarf generation; currently the SP register is supported. Whenever SP is updated there should be a CFA update; we mark these instructions as frame related, and if the update is too complex for gcc to guess the adjustment, we attach an explicit annotation. Both the dwarf frame info size and pro/epilogue scheduling are improved after this patch, as aarch64_add_constant makes better use of the scratch register. OK for trunk? gcc/ 2016-07-20 Jiong Wang <jiong.wang@arm.com> * config/aarch64/aarch64.c (aarch64_add_constant): Mark instruction as frame related when it is. Generate CFA annotation when it's necessary. (aarch64_expand_prologue): Use aarch64_add_constant. (aarch64_expand_epilogue): Likewise. [-- Attachment #2: build-const-3.patch --] [-- Type: text/x-patch, Size: 4261 bytes --] diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 41844a101247c939ecb31f8a8c17cf79759255aa..b38f3f1e8f85a5f3191d0c96080327dac7b2eaed 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1874,6 +1874,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, { HOST_WIDE_INT mdelta = abs_hwi (delta); rtx this_rtx = gen_rtx_REG (mode, regnum); + bool frame_related_p = (regnum == SP_REGNUM); + rtx_insn *insn; /* Do nothing if mdelta is zero. 
*/ if (!mdelta) @@ -1882,7 +1884,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, /* We only need single instruction if the offset fit into add/sub. */ if (aarch64_uimm12_shift (mdelta)) { - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; return; } @@ -1895,15 +1898,23 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, HOST_WIDE_INT low_off = mdelta & 0xfff; low_off = delta < 0 ? -low_off : low_off; - emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; return; } /* Otherwise use generic function to handle all other situations. 
*/ rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode); - emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); + insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); + if (frame_related_p) + { + RTX_FRAME_RELATED_P (insn) = frame_related_p; + rtx adj = plus_constant (mode, this_rtx, delta); + add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj)); + } } static bool @@ -3038,36 +3049,7 @@ aarch64_expand_prologue (void) frame_size -= (offset + crtl->outgoing_args_size); fp_offset = 0; - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (-frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - - add_reg_note (insn, REG_CFA_ADJUST_CFA, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -frame_size))); - RTX_FRAME_RELATED_P (insn) = 1; - } - else if (frame_size > 0) - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; - - if (hi_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-hi_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - } - if (lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-lo_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - } - } + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size); } else frame_size = -1; @@ -3287,31 +3269,7 @@ aarch64_expand_epilogue (bool for_sibcall) if (need_barrier_p) emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - } - else - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; - - if (hi_ofs && lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (hi_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - frame_size = 
lo_ofs; - } - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (frame_size))); - } - - /* Reset the CFA to be SP + 0. */ - add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size); } /* Stack adjustment for exception handler. */ ^ permalink raw reply [flat|nested] 9+ messages in thread
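The immediate-splitting behaviour that aarch64_add_constant applies in the patch above can be modelled as a small standalone C function. This is a hypothetical sketch for illustration only — `decompose`, `uimm12_shift` and the return convention are invented here and are not the GCC code itself:

```c
/* AArch64 add/sub accepts a 12-bit unsigned immediate, optionally
   shifted left by 12; this mirrors what aarch64_uimm12_shift tests.  */
static int
uimm12_shift (long long m)
{
  return (m & 0xfffLL) == m || (m & 0xfff000LL) == m;
}

/* Sketch of the decomposition: returns the number of instructions the
   split would take and stores the one or two add/sub immediates used
   (0 = unused).  A return of 3 stands for the
   "mov scratch, #imm; add reg, reg, scratch" fallback.  */
static int
decompose (long long delta, long long *imm1, long long *imm2)
{
  long long mdelta = delta < 0 ? -delta : delta;

  *imm1 = *imm2 = 0;
  if (mdelta == 0)
    return 0;                      /* nothing to do */
  if (uimm12_shift (mdelta))
    {
      *imm1 = delta;               /* single add/sub */
      return 1;
    }
  if (mdelta < 0x1000000)          /* fits in 24 bits: two adds */
    {
      long long low = mdelta & 0xfff;
      low = delta < 0 ? -low : low;    /* keep the sign of delta */
      *imm1 = low;
      *imm2 = delta - low;
      return 2;
    }
  return 3;                        /* scratch-register fallback */
}
```

Note that in the two-instruction case both immediates carry the sign of delta, which is relevant to the transient-deallocation question raised later in the thread.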
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-20 13:03 ` [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant Jiong Wang @ 2016-07-20 14:19 ` Richard Earnshaw (lists) 2016-07-20 15:02 ` Jiong Wang 0 siblings, 1 reply; 9+ messages in thread From: Richard Earnshaw (lists) @ 2016-07-20 14:19 UTC (permalink / raw) To: Jiong Wang, GCC Patches On 20/07/16 14:03, Jiong Wang wrote: > Those stack adjustment sequences inside aarch64_expand_prologue/epilogue > are doing exactly what's aarch64_add_constant offered, except they also > need to be aware of dwarf generation. > > This patch teach existed aarch64_add_constant about dwarf generation and > currently SP register is supported. Whenever SP is updated, there > should be CFA update, we then mark these instructions as frame related, > and if the update is too complex for gcc to guess the adjustment, we > attach explicit annotation. > > Both dwarf frame info size and pro/epilogue scheduling are improved after > this patch as aarch64_add_constant has better utilization of scratch > register. > > OK for trunk? > > gcc/ > 2016-07-20 Jiong Wang <jiong.wang@arm.com> > > * config/aarch64/aarch64.c (aarch64_add_constant): Mark > instruction as frame related when it is. Generate CFA > annotation when it's necessary. > (aarch64_expand_prologue): Use aarch64_add_constant. > (aarch64_expand_epilogue): Likewise. > Are you sure using aarch64_add_constant is unconditionally safe? Stack adjustments need to be done very carefully to ensure that we never transiently deallocate part of the stack. R. 
> > build-const-3.patch > > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 41844a101247c939ecb31f8a8c17cf79759255aa..b38f3f1e8f85a5f3191d0c96080327dac7b2eaed 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -1874,6 +1874,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, > { > HOST_WIDE_INT mdelta = abs_hwi (delta); > rtx this_rtx = gen_rtx_REG (mode, regnum); > + bool frame_related_p = (regnum == SP_REGNUM); > + rtx_insn *insn; > > /* Do nothing if mdelta is zero. */ > if (!mdelta) > @@ -1882,7 +1884,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, > /* We only need single instruction if the offset fit into add/sub. */ > if (aarch64_uimm12_shift (mdelta)) > { > - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); > + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); > + RTX_FRAME_RELATED_P (insn) = frame_related_p; > return; > } > > @@ -1895,15 +1898,23 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, > HOST_WIDE_INT low_off = mdelta & 0xfff; > > low_off = delta < 0 ? -low_off : low_off; > - emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); > - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); > + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); > + RTX_FRAME_RELATED_P (insn) = frame_related_p; > + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); > + RTX_FRAME_RELATED_P (insn) = frame_related_p; > return; > } > > /* Otherwise use generic function to handle all other situations. 
*/ > rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); > aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode); > - emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); > + insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); > + if (frame_related_p) > + { > + RTX_FRAME_RELATED_P (insn) = frame_related_p; > + rtx adj = plus_constant (mode, this_rtx, delta); > + add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj)); > + } > } > > static bool > @@ -3038,36 +3049,7 @@ aarch64_expand_prologue (void) > frame_size -= (offset + crtl->outgoing_args_size); > fp_offset = 0; > > - if (frame_size >= 0x1000000) > - { > - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); > - emit_move_insn (op0, GEN_INT (-frame_size)); > - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); > - > - add_reg_note (insn, REG_CFA_ADJUST_CFA, > - gen_rtx_SET (stack_pointer_rtx, > - plus_constant (Pmode, stack_pointer_rtx, > - -frame_size))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > - else if (frame_size > 0) > - { > - int hi_ofs = frame_size & 0xfff000; > - int lo_ofs = frame_size & 0x000fff; > - > - if (hi_ofs) > - { > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, GEN_INT (-hi_ofs))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > - if (lo_ofs) > - { > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, GEN_INT (-lo_ofs))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > - } > + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size); > } > else > frame_size = -1; > @@ -3287,31 +3269,7 @@ aarch64_expand_epilogue (bool for_sibcall) > if (need_barrier_p) > emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > > - if (frame_size >= 0x1000000) > - { > - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); > - emit_move_insn (op0, GEN_INT (frame_size)); > - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); > - } > - else > - { > - int hi_ofs = frame_size & 0xfff000; > - int lo_ofs = frame_size & 0x000fff; > - > - if (hi_ofs && 
lo_ofs) > - { > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, GEN_INT (hi_ofs))); > - RTX_FRAME_RELATED_P (insn) = 1; > - frame_size = lo_ofs; > - } > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, GEN_INT (frame_size))); > - } > - > - /* Reset the CFA to be SP + 0. */ > - add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx); > - RTX_FRAME_RELATED_P (insn) = 1; > + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size); > } > > /* Stack adjustment for exception handler. */ > ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-20 14:19 ` Richard Earnshaw (lists) @ 2016-07-20 15:02 ` Jiong Wang 2016-07-20 15:09 ` Richard Earnshaw (lists) 2016-07-21 10:08 ` Richard Earnshaw (lists) 0 siblings, 2 replies; 9+ messages in thread From: Jiong Wang @ 2016-07-20 15:02 UTC (permalink / raw) To: Richard Earnshaw (lists); +Cc: GCC Patches On 20/07/16 15:18, Richard Earnshaw (lists) wrote: > On 20/07/16 14:03, Jiong Wang wrote: >> Those stack adjustment sequences inside aarch64_expand_prologue/epilogue >> are doing exactly what's aarch64_add_constant offered, except they also >> need to be aware of dwarf generation. >> >> This patch teach existed aarch64_add_constant about dwarf generation and >> currently SP register is supported. Whenever SP is updated, there >> should be CFA update, we then mark these instructions as frame related, >> and if the update is too complex for gcc to guess the adjustment, we >> attach explicit annotation. >> >> Both dwarf frame info size and pro/epilogue scheduling are improved after >> this patch as aarch64_add_constant has better utilization of scratch >> register. >> >> OK for trunk? >> >> gcc/ >> 2016-07-20 Jiong Wang <jiong.wang@arm.com> >> >> * config/aarch64/aarch64.c (aarch64_add_constant): Mark >> instruction as frame related when it is. Generate CFA >> annotation when it's necessary. >> (aarch64_expand_prologue): Use aarch64_add_constant. >> (aarch64_expand_epilogue): Likewise. >> > Are you sure using aarch64_add_constant is unconditionally safe? Stack > adjustments need to be done very carefully to ensure that we never > transiently deallocate part of the stack. Richard, Thanks for the review. Yes, I believe using aarch64_add_constant is unconditionally safe here, because we have generated a stack tie to clobber the whole of memory, thus preventing any instruction that accesses the stack from being scheduled after it. 
The access-to-deallocated-stack issue was there before and was fixed by https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02292.html. aarch64_add_constant itself generates the same instruction sequences as the original code, except that in a few cases it will prefer move scratch_reg, #imm add sp, sp, scratch_reg rather than: add sp, sp, #imm_part1 add sp, sp, #imm_part2 ^ permalink raw reply [flat|nested] 9+ messages in thread
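The preference described above pays off whenever the whole delta can be loaded into the scratch register in one instruction. A hedged sketch of that single-mov test — `one_insn_mov` is invented here, loosely modelled on the single-instruction MOVZ/MOVN cases that aarch64_internal_mov_immediate can use; the real function handles many more patterns (MOVK chains, bitmask immediates):

```c
/* Hypothetical helper: can VAL be materialised into a 64-bit register
   with a single mov?  MOVZ covers values with one non-zero 16-bit
   chunk; MOVN covers values whose bitwise inverse has one such chunk.  */
static int
one_insn_mov (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    {
      unsigned long long mask = 0xffffULL << shift;
      if ((val & ~mask) == 0)      /* MOVZ: one 16-bit chunk set */
        return 1;
      if ((~val & ~mask) == 0)     /* MOVN: one chunk of the inverse set */
        return 1;
    }
  return 0;
}
```

For example, -4096 (0xfffffffffffff000) is a single MOVN, so `mov x16, #-4096; add sp, sp, x16` needs no more instructions than the two-add split while freeing the scheduler to hoist the mov.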
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-20 15:02 ` Jiong Wang @ 2016-07-20 15:09 ` Richard Earnshaw (lists) 2016-07-21 10:08 ` Richard Earnshaw (lists) 1 sibling, 0 replies; 9+ messages in thread From: Richard Earnshaw (lists) @ 2016-07-20 15:09 UTC (permalink / raw) To: Jiong Wang; +Cc: GCC Patches On 20/07/16 16:02, Jiong Wang wrote: > On 20/07/16 15:18, Richard Earnshaw (lists) wrote: >> On 20/07/16 14:03, Jiong Wang wrote: >>> Those stack adjustment sequences inside aarch64_expand_prologue/epilogue >>> are doing exactly what's aarch64_add_constant offered, except they also >>> need to be aware of dwarf generation. >>> >>> This patch teach existed aarch64_add_constant about dwarf generation and >>> currently SP register is supported. Whenever SP is updated, there >>> should be CFA update, we then mark these instructions as frame related, >>> and if the update is too complex for gcc to guess the adjustment, we >>> attach explicit annotation. >>> >>> Both dwarf frame info size and pro/epilogue scheduling are improved >>> after >>> this patch as aarch64_add_constant has better utilization of scratch >>> register. >>> >>> OK for trunk? >>> >>> gcc/ >>> 2016-07-20 Jiong Wang <jiong.wang@arm.com> >>> >>> * config/aarch64/aarch64.c (aarch64_add_constant): Mark >>> instruction as frame related when it is. Generate CFA >>> annotation when it's necessary. >>> (aarch64_expand_prologue): Use aarch64_add_constant. >>> (aarch64_expand_epilogue): Likewise. >>> >> Are you sure using aarch64_add_constant is unconditionally safe? Stack >> adjustments need to be done very carefully to ensure that we never >> transiently deallocate part of the stack. > > Richard, > > Thanks for the review, yes, I believe using aarch64_add_constant is > unconditionally > safe here. Because we have generated a stack tie to clobber the whole > memory thus > prevent any instruction which access stack be scheduled after that. 
> > The access to deallocated stack issue was there and fixed by > > https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02292.html. > > aarch64_add_constant itself is generating the same instruction > sequences as the > original code, except for a few cases, it will prefer > > move scratch_reg, #imm > add sp, sp, scratch_reg > > than: > add sp, sp, #imm_part1 > add sp, sp, #imm_part2 > > > > But can you guarantee we will never get an add and a sub in a single adjustment? If not, then we need to ensure the two instructions appear in the right order. R. ^ permalink raw reply [flat|nested] 9+ messages in thread
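On the question above: the two-add path in the patch derives both immediates from delta with the same sign (`low_off = delta < 0 ? -low_off : low_off;`), so that path never mixes an add and a sub and SP only ever moves monotonically toward its target. A small illustrative C check of that invariant over the 24-bit range (an editor's sketch, not GCC code):

```c
#include <assert.h>

/* Brute-force check that the two-add split used by aarch64_add_constant
   never produces immediates of opposite sign, i.e. the stack pointer is
   never transiently moved past its final value by this path.  */
static void
check_two_add_split (void)
{
  for (long long delta = -0xffffff; delta <= 0xffffff; delta += 0x3b)
    {
      long long mdelta = delta < 0 ? -delta : delta;
      long long low, high;

      if (mdelta == 0)
        continue;
      low = mdelta & 0xfff;
      low = delta < 0 ? -low : low;   /* as in the patch */
      high = delta - low;
      /* Neither part may move SP in the opposite direction...  */
      if (delta < 0)
        assert (low <= 0 && high <= 0);
      else
        assert (low >= 0 && high >= 0);
      /* ...and together they must reach exactly delta.  */
      assert (low + high == delta);
    }
}
```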
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-20 15:02 ` Jiong Wang 2016-07-20 15:09 ` Richard Earnshaw (lists) @ 2016-07-21 10:08 ` Richard Earnshaw (lists) 2016-07-25 9:34 ` Jiong Wang 1 sibling, 1 reply; 9+ messages in thread From: Richard Earnshaw (lists) @ 2016-07-21 10:08 UTC (permalink / raw) To: Jiong Wang; +Cc: GCC Patches On 20/07/16 16:02, Jiong Wang wrote: > On 20/07/16 15:18, Richard Earnshaw (lists) wrote: >> On 20/07/16 14:03, Jiong Wang wrote: >>> Those stack adjustment sequences inside aarch64_expand_prologue/epilogue >>> are doing exactly what's aarch64_add_constant offered, except they also >>> need to be aware of dwarf generation. >>> >>> This patch teach existed aarch64_add_constant about dwarf generation and >>> currently SP register is supported. Whenever SP is updated, there >>> should be CFA update, we then mark these instructions as frame related, >>> and if the update is too complex for gcc to guess the adjustment, we >>> attach explicit annotation. >>> >>> Both dwarf frame info size and pro/epilogue scheduling are improved >>> after >>> this patch as aarch64_add_constant has better utilization of scratch >>> register. >>> >>> OK for trunk? >>> >>> gcc/ >>> 2016-07-20 Jiong Wang <jiong.wang@arm.com> >>> >>> * config/aarch64/aarch64.c (aarch64_add_constant): Mark >>> instruction as frame related when it is. Generate CFA >>> annotation when it's necessary. >>> (aarch64_expand_prologue): Use aarch64_add_constant. >>> (aarch64_expand_epilogue): Likewise. >>> >> Are you sure using aarch64_add_constant is unconditionally safe? Stack >> adjustments need to be done very carefully to ensure that we never >> transiently deallocate part of the stack. > > Richard, > > Thanks for the review, yes, I believe using aarch64_add_constant is > unconditionally > safe here. 
Because we have generated a stack tie to clobber the whole > memory thus > prevent any instruction which access stack be scheduled after that. > > The access to deallocated stack issue was there and fixed by > > https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02292.html. > > aarch64_add_constant itself is generating the same instruction > sequences as the > original code, except for a few cases, it will prefer > > move scratch_reg, #imm > add sp, sp, scratch_reg > > than: > add sp, sp, #imm_part1 > add sp, sp, #imm_part2 > > > > OK, I've had another look at this and I'm happy that we don't (currently) run into the problem I'm concerned about. However, this new usage does impose a constraint on aarch64_add_constant that will need to be respected in future, so please can you add the following to the comment that precedes that function: /* ... This function is sometimes used to adjust the stack pointer, so we must ensure that it can never cause transient stack deallocation by writing an invalid value into REGNUM. */ > + bool frame_related_p = (regnum == SP_REGNUM); I think it would be better to make the frame-related decision be an explicit parameter passed to the routine (don't forget SP is not always the frame pointer). Then the new uses would pass 'true' and the existing uses 'false'. R. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-21 10:08 ` Richard Earnshaw (lists) @ 2016-07-25 9:34 ` Jiong Wang 2016-07-25 13:08 ` Richard Earnshaw (lists) 0 siblings, 1 reply; 9+ messages in thread From: Jiong Wang @ 2016-07-25 9:34 UTC (permalink / raw) To: Richard Earnshaw (lists); +Cc: GCC Patches [-- Attachment #1: Type: text/plain, Size: 2401 bytes --] On 21/07/16 11:08, Richard Earnshaw (lists) wrote: > On 20/07/16 16:02, Jiong Wang wrote: >> Richard, >> Thanks for the review, yes, I believe using aarch64_add_constant is >> unconditionally >> safe here. Because we have generated a stack tie to clobber the whole >> memory thus >> prevent any instruction which access stack be scheduled after that. >> >> The access to deallocated stack issue was there and fixed by >> >> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02292.html. >> >> aarch64_add_constant itself is generating the same instruction >> sequences as the >> original code, except for a few cases, it will prefer >> >> move scratch_reg, #imm >> add sp, sp, scratch_reg >> >> than: >> add sp, sp, #imm_part1 >> add sp, sp, #imm_part2 >> >> >> >> > OK, I've had another look at this and I'm happy that we don't > (currently) run into the problem I'm concerned about. > > However, this new usage does impose a constraint on aarch64_add_constant > that will need to be respected in future, so please can you add the > following to the comment that precedes that function: > > /* ... > > This function is sometimes used to adjust the stack pointer, so we > must ensure that it can never cause transient stack deallocation > by writing an invalid value into REGNUM. */ > > >> + bool frame_related_p = (regnum == SP_REGNUM); > I think it would be better to make the frame-related decision be an > explicit parameter passed to the routine (don't forget SP is not always > the frame pointer). Then the new uses would pass 'true' and the > existing uses 'false'. > > R. 
Thanks, attached is the updated patch, which: * Adds the new comments above for aarch64_add_constant. * Adds one new parameter "frame_related_p" to aarch64_add_constant. I considered adding a new gcc assertion as a sanity check of frame_related_p and REGNUM, but haven't done that as I found dwarf2cfi.c is already doing it. OK for trunk? gcc/ 2016-07-25 Jiong Wang <jiong.wang@arm.com> * config/aarch64/aarch64.c (aarch64_add_constant): New parameter "frame_related_p". Generate CFA annotation when it's necessary. (aarch64_expand_prologue): Use aarch64_add_constant. (aarch64_expand_epilogue): Likewise. (aarch64_output_mi_thunk): Pass "false" when calling aarch64_add_constant. [-- Attachment #2: update.patch --] [-- Type: text/x-patch, Size: 5380 bytes --] diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 41844a1..ca93f6e 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1866,14 +1866,19 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) } /* Add DELTA onto REGNUM in MODE, using SCRATCHREG to held intermediate value if - it is necessary. */ + it is necessary. + + This function is sometimes used to adjust the stack pointer, so we must + ensure that it can never cause transient stack deallocation by writing an + invalid value into REGNUM. */ static void aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, - HOST_WIDE_INT delta) + HOST_WIDE_INT delta, bool frame_related_p) { HOST_WIDE_INT mdelta = abs_hwi (delta); rtx this_rtx = gen_rtx_REG (mode, regnum); + rtx_insn *insn; /* Do nothing if mdelta is zero. */ if (!mdelta) @@ -1882,7 +1887,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, /* We only need single instruction if the offset fit into add/sub. 
*/ if (aarch64_uimm12_shift (mdelta)) { - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; return; } @@ -1895,15 +1901,23 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, HOST_WIDE_INT low_off = mdelta & 0xfff; low_off = delta < 0 ? -low_off : low_off; - emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); - emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; + insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off))); + RTX_FRAME_RELATED_P (insn) = frame_related_p; return; } /* Otherwise use generic function to handle all other situations. */ rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode); - emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); + insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); + if (frame_related_p) + { + RTX_FRAME_RELATED_P (insn) = frame_related_p; + rtx adj = plus_constant (mode, this_rtx, delta); + add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj)); + } } static bool @@ -3038,36 +3052,7 @@ aarch64_expand_prologue (void) frame_size -= (offset + crtl->outgoing_args_size); fp_offset = 0; - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (-frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - - add_reg_note (insn, REG_CFA_ADJUST_CFA, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -frame_size))); - RTX_FRAME_RELATED_P (insn) = 1; - } - else if (frame_size > 0) - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; - - if (hi_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-hi_ofs))); - 
RTX_FRAME_RELATED_P (insn) = 1; - } - if (lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (-lo_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - } - } + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true); } else frame_size = -1; @@ -3287,31 +3272,7 @@ aarch64_expand_epilogue (bool for_sibcall) if (need_barrier_p) emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); - if (frame_size >= 0x1000000) - { - rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM); - emit_move_insn (op0, GEN_INT (frame_size)); - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0)); - } - else - { - int hi_ofs = frame_size & 0xfff000; - int lo_ofs = frame_size & 0x000fff; - - if (hi_ofs && lo_ofs) - { - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (hi_ofs))); - RTX_FRAME_RELATED_P (insn) = 1; - frame_size = lo_ofs; - } - insn = emit_insn (gen_add2_insn - (stack_pointer_rtx, GEN_INT (frame_size))); - } - - /* Reset the CFA to be SP + 0. */ - add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true); } /* Stack adjustment for exception handler. */ @@ -3398,7 +3359,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, emit_note (NOTE_INSN_PROLOGUE_END); if (vcall_offset == 0) - aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta); + aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false); else { gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0); @@ -3414,7 +3375,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx, plus_constant (Pmode, this_rtx, delta)); else - aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta); + aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false); } if (Pmode == ptr_mode) ^ permalink raw reply [flat|nested] 9+ messages in thread
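The REG_CFA_ADJUST_CFA note added in the scratch-register case of the patch exists because a CFI consumer can infer the CFA change from `add sp, sp, #imm`, but not from `add sp, sp, x16` — the register form carries no constant. A toy model of that distinction (all names below are invented for illustration; this is neither GCC nor unwinder code):

```c
enum kind { ADD_IMM, ADD_REG };

struct insn
{
  enum kind k;
  long long imm;        /* immediate for ADD_IMM, else unused */
  int has_cfa_note;     /* explicit REG_CFA_ADJUST_CFA attached?  */
  long long note_delta; /* delta recorded by the note */
};

/* Walk a frame-related sequence and recover the total SP adjustment,
   the way a CFI consumer conceptually would: immediates can be
   inferred from the instruction itself, register adds need the
   explicit note.  Returns 0 if the unwind info would be incomplete.  */
static int
recover_adjustment (const struct insn *seq, int n, long long *total)
{
  long long t = 0;
  for (int i = 0; i < n; i++)
    {
      if (seq[i].k == ADD_IMM)
        t += seq[i].imm;
      else if (seq[i].has_cfa_note)
        t += seq[i].note_delta;
      else
        return 0;                  /* CFA would be lost here */
    }
  *total = t;
  return 1;
}
```

This is why the patch attaches the note only on the `gen_add2_insn (this_rtx, scratch_rtx)` path: the two immediate paths are self-describing.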
* Re: [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant 2016-07-25 9:34 ` Jiong Wang @ 2016-07-25 13:08 ` Richard Earnshaw (lists) 0 siblings, 0 replies; 9+ messages in thread From: Richard Earnshaw (lists) @ 2016-07-25 13:08 UTC (permalink / raw) To: Jiong Wang; +Cc: GCC Patches On 25/07/16 10:34, Jiong Wang wrote: > On 21/07/16 11:08, Richard Earnshaw (lists) wrote: >> On 20/07/16 16:02, Jiong Wang wrote: >>> Richard, >>> Thanks for the review, yes, I believe using aarch64_add_constant is >>> unconditionally >>> safe here. Because we have generated a stack tie to clobber the whole >>> memory thus >>> prevent any instruction which access stack be scheduled after that. >>> >>> The access to deallocated stack issue was there and fixed by >>> >>> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02292.html. >>> >>> aarch64_add_constant itself is generating the same instruction >>> sequences as the >>> original code, except for a few cases, it will prefer >>> >>> move scratch_reg, #imm >>> add sp, sp, scratch_reg >>> >>> than: >>> add sp, sp, #imm_part1 >>> add sp, sp, #imm_part2 >>> >>> >>> >>> >> OK, I've had another look at this and I'm happy that we don't >> (currently) run into the problem I'm concerned about. >> >> However, this new usage does impose a constraint on aarch64_add_constant >> that will need to be respected in future, so please can you add the >> following to the comment that precedes that function: >> >> /* ... >> >> This function is sometimes used to adjust the stack pointer, so we >> must ensure that it can never cause transient stack deallocation >> by writing an invalid value into REGNUM. */ >> >> >>> + bool frame_related_p = (regnum == SP_REGNUM); >> I think it would be better to make the frame-related decision be an >> explicit parameter passed to the routine (don't forget SP is not always >> the frame pointer). Then the new uses would pass 'true' and the >> existing uses 'false'. >> >> R. 
> > Thanks, attachment is the updated patch which: > > * Added above new comments for aarch64_add_constant. > * One new parameter "frame_related_p" for aarch64_add_constant. > I thought adding new gcc assertion for sanity check of > frame_related_p and REGNUM, haven't done that as I found dwarf2cfi.c > is doing that. > > OK for trunk? This is fine, thanks. R. > > gcc/ > 2016-07-25 Jiong Wang <jiong.wang@arm.com> > > * config/aarch64/aarch64.c (aarch64_add_constant): New > parameter "frame_related_p". Generate CFA annotation when > it's necessary. > (aarch64_expand_prologue): Use aarch64_add_constant. > (aarch64_expand_epilogue): Likewise. > (aarch64_output_mi_thunk): Pass "false" when calling > aarch64_add_constant. > > > update.patch > > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 41844a1..ca93f6e 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -1866,14 +1866,19 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) > } > > /* Add DELTA onto REGNUM in MODE, using SCRATCHREG to held intermediate value if > - it is necessary. */ > + it is necessary. > + > + This function is sometimes used to adjust the stack pointer, so we must > + ensure that it can never cause transient stack deallocation by writing an > + invalid value into REGNUM. */ > > static void > aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, > - HOST_WIDE_INT delta) > + HOST_WIDE_INT delta, bool frame_related_p) > { > HOST_WIDE_INT mdelta = abs_hwi (delta); > rtx this_rtx = gen_rtx_REG (mode, regnum); > + rtx_insn *insn; > > /* Do nothing if mdelta is zero. */ > if (!mdelta) > @@ -1882,7 +1887,8 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, > /* We only need single instruction if the offset fit into add/sub. 
*/
>    if (aarch64_uimm12_shift (mdelta))
>      {
> -      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
> +      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
> +      RTX_FRAME_RELATED_P (insn) = frame_related_p;
>        return;
>      }
>
> @@ -1895,15 +1901,23 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
>        HOST_WIDE_INT low_off = mdelta & 0xfff;
>
>        low_off = delta < 0 ? -low_off : low_off;
> -      emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
> -      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
> +      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
> +      RTX_FRAME_RELATED_P (insn) = frame_related_p;
> +      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
> +      RTX_FRAME_RELATED_P (insn) = frame_related_p;
>        return;
>      }
>
>    /* Otherwise use generic function to handle all other situations.  */
>    rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
>    aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
> -  emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
> +  insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
> +  if (frame_related_p)
> +    {
> +      RTX_FRAME_RELATED_P (insn) = frame_related_p;
> +      rtx adj = plus_constant (mode, this_rtx, delta);
> +      add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
> +    }
>  }
>
>  static bool
> @@ -3038,36 +3052,7 @@ aarch64_expand_prologue (void)
>        frame_size -= (offset + crtl->outgoing_args_size);
>        fp_offset = 0;
>
> -      if (frame_size >= 0x1000000)
> -        {
> -          rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
> -          emit_move_insn (op0, GEN_INT (-frame_size));
> -          insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
> -
> -          add_reg_note (insn, REG_CFA_ADJUST_CFA,
> -                        gen_rtx_SET (stack_pointer_rtx,
> -                                     plus_constant (Pmode, stack_pointer_rtx,
> -                                                    -frame_size)));
> -          RTX_FRAME_RELATED_P (insn) = 1;
> -        }
> -      else if (frame_size > 0)
> -        {
> -          int hi_ofs = frame_size & 0xfff000;
> -          int lo_ofs = frame_size & 0x000fff;
> -
> -          if (hi_ofs)
> -            {
> -              insn = emit_insn (gen_add2_insn
> -                                (stack_pointer_rtx, GEN_INT (-hi_ofs)));
> -              RTX_FRAME_RELATED_P (insn) = 1;
> -            }
> -          if (lo_ofs)
> -            {
> -              insn = emit_insn (gen_add2_insn
> -                                (stack_pointer_rtx, GEN_INT (-lo_ofs)));
> -              RTX_FRAME_RELATED_P (insn) = 1;
> -            }
> -        }
> +      aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true);
>      }
>    else
>      frame_size = -1;
> @@ -3287,31 +3272,7 @@ aarch64_expand_epilogue (bool for_sibcall)
>        if (need_barrier_p)
>          emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
>
> -      if (frame_size >= 0x1000000)
> -        {
> -          rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
> -          emit_move_insn (op0, GEN_INT (frame_size));
> -          insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
> -        }
> -      else
> -        {
> -          int hi_ofs = frame_size & 0xfff000;
> -          int lo_ofs = frame_size & 0x000fff;
> -
> -          if (hi_ofs && lo_ofs)
> -            {
> -              insn = emit_insn (gen_add2_insn
> -                                (stack_pointer_rtx, GEN_INT (hi_ofs)));
> -              RTX_FRAME_RELATED_P (insn) = 1;
> -              frame_size = lo_ofs;
> -            }
> -          insn = emit_insn (gen_add2_insn
> -                            (stack_pointer_rtx, GEN_INT (frame_size)));
> -        }
> -
> -      /* Reset the CFA to be SP + 0.  */
> -      add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
> -      RTX_FRAME_RELATED_P (insn) = 1;
> +      aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true);
>      }
>
>    /* Stack adjustment for exception handler.  */
> @@ -3398,7 +3359,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
>    emit_note (NOTE_INSN_PROLOGUE_END);
>
>    if (vcall_offset == 0)
> -    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
> +    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
>    else
>      {
>        gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
> @@ -3414,7 +3375,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
>          addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
>                                     plus_constant (Pmode, this_rtx, delta));
>        else
> -        aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
> +        aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
>      }
>
>    if (Pmode == ptr_mode)

^ permalink raw reply	[flat|nested] 9+ messages in thread
* [AArch64][2/3] Optimize aarch64_add_constant to generate better addition sequences
       [not found] <626096f0-957f-98bd-5efa-faa0c14eb5ab@foss.arm.com>
       [not found] ` <b653851c-9587-a498-a8da-8d235b4ddbcc@foss.arm.com>
@ 2016-07-20 13:03 ` Jiong Wang
  2016-07-20 14:13   ` Richard Earnshaw (lists)
  1 sibling, 1 reply; 9+ messages in thread
From: Jiong Wang @ 2016-07-20 13:03 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1332 bytes --]

This patch optimizes the immediate addition sequences generated by
aarch64_add_constant.

The addition sequences currently generated are:

  * If the immediate fits into the unsigned 12-bit range, generate a
    single add/sub.

  * Otherwise, if it fits into the unsigned 24-bit range, generate two
    add/sub instructions.

  * Otherwise invoke the general constant-building function.

This did not consider the case where the immediate cannot fit into the
unsigned 12-bit range but can be built by a single mov instruction, for
which we can generate one move and one addition.  The move does not
touch the destination register, so that sequence is better than two
additions, both of which touch the destination register.

This patch therefore optimizes the addition sequences into:

  * If the immediate fits into the unsigned 12-bit range, generate a
    single add/sub.

  * Otherwise, if it fits into the unsigned 24-bit range, generate two
    add/sub instructions, unless it can be built by a single move
    instruction, in which case move the immediate into the scratch
    register first, then generate one addition of the scratch register
    to the destination register.

  * Otherwise invoke the general constant-building function.

OK for trunk?

gcc/
2016-07-20  Jiong Wang  <jiong.wang@arm.com>

        * config/aarch64/aarch64.c (aarch64_add_constant): Optimize
        instruction sequences.
[-- Attachment #2: build-const-2.patch --]
[-- Type: text/x-patch, Size: 3283 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index aeea3b3ebc514663043ac8d7cd13361f06f78502..41844a101247c939ecb31f8a8c17cf79759255aa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1865,6 +1865,47 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
 }

+/* Add DELTA onto REGNUM in MODE, using SCRATCHREG to held intermediate value if
+   it is necessary.  */
+
+static void
+aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
+                      HOST_WIDE_INT delta)
+{
+  HOST_WIDE_INT mdelta = abs_hwi (delta);
+  rtx this_rtx = gen_rtx_REG (mode, regnum);
+
+  /* Do nothing if mdelta is zero.  */
+  if (!mdelta)
+    return;
+
+  /* We only need single instruction if the offset fit into add/sub.  */
+  if (aarch64_uimm12_shift (mdelta))
+    {
+      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
+      return;
+    }
+
+  /* We need two add/sub instructions, each one perform part of the
+     addition/subtraction, but don't this if the addend can be loaded into
+     register by single instruction, in that case we prefer a move to scratch
+     register following by addition.  */
+  if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
+    {
+      HOST_WIDE_INT low_off = mdelta & 0xfff;
+
+      low_off = delta < 0 ? -low_off : low_off;
+      emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
+      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
+      return;
+    }
+
+  /* Otherwise use generic function to handle all other situations.  */
+  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
+  aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
+  emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
+}
+
 static bool
 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                  tree exp ATTRIBUTE_UNUSED)
@@ -3337,44 +3378,6 @@ aarch64_final_eh_return_addr (void)
                                        - 2 * UNITS_PER_WORD));
 }

-static void
-aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
-                      HOST_WIDE_INT delta)
-{
-  HOST_WIDE_INT mdelta = delta;
-  rtx this_rtx = gen_rtx_REG (mode, regnum);
-  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
-
-  if (mdelta < 0)
-    mdelta = -mdelta;
-
-  if (mdelta >= 4096 * 4096)
-    {
-      aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
-      emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
-    }
-  else if (mdelta > 0)
-    {
-      if (mdelta >= 4096)
-        {
-          emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
-          rtx shift = gen_rtx_ASHIFT (mode, scratch_rtx, GEN_INT (12));
-          if (delta < 0)
-            emit_insn (gen_rtx_SET (this_rtx,
-                                    gen_rtx_MINUS (mode, this_rtx, shift)));
-          else
-            emit_insn (gen_rtx_SET (this_rtx,
-                                    gen_rtx_PLUS (mode, this_rtx, shift)));
-        }
-      if (mdelta % 4096 != 0)
-        {
-          scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
-          emit_insn (gen_rtx_SET (this_rtx,
-                                  gen_rtx_PLUS (mode, this_rtx, scratch_rtx)));
-        }
-    }
-}
-
 /* Output code to add DELTA to the first argument, and then jump
    to FUNCTION.  Used for C++ multiple inheritance.  */
 static void

^ permalink raw reply	[flat|nested] 9+ messages in thread
* Re: [AArch64][2/3] Optimize aarch64_add_constant to generate better addition sequences
  2016-07-20 13:03 ` [AArch64][2/3] Optimize aarch64_add_constant to generate better addition sequences Jiong Wang
@ 2016-07-20 14:13   ` Richard Earnshaw (lists)
  0 siblings, 0 replies; 9+ messages in thread
From: Richard Earnshaw (lists) @ 2016-07-20 14:13 UTC (permalink / raw)
  To: Jiong Wang, GCC Patches

On 20/07/16 14:02, Jiong Wang wrote:
> This patch optimize immediate addition sequences generated by
> aarch64_add_constant.
>
> The current addition sequences generated are:
>
>   * If immediate fit into unsigned 12bit range, generate single add/sub.
>   * Otherwise if it fit into unsigned 24bit range, generate two
>     add/sub.
>
>   * Otherwise invoke general constant build function.
>
>
> This haven't considered the situation where immedate can't fit into
> unsigned 12bit range, but can fit into single mov instruction for which
> case we generate one move and one addition.  The move won't touch the
> destination register thus the sequences is better than two additions
> which both touch the destination register.
>
>
> This patch thus optimize the addition sequences into:
>
>   * If immediate fit into unsigned 12bit range, generate single add/sub.
>
>   * Otherwise if it fit into unsigned 24bit range, generate two add/sub.
>     And don't do this if it fit into single move instruction, in which case
>     move the immedaite to scratch register firstly, then generate one
>     addition to add the scratch register to the destination register.
>   * Otherwise invoke general constant build function.
>
>
> OK for trunk?
>
> gcc/
> 2016-07-20  Jiong Wang  <jiong.wang@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_add_constant): Optimize
>         instruction sequences.
>

OK with the updates to the comments as mentioned below.

> build-const-2.patch
>
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index aeea3b3ebc514663043ac8d7cd13361f06f78502..41844a101247c939ecb31f8a8c17cf79759255aa 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1865,6 +1865,47 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>    aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
>  }
>
> +/* Add DELTA onto REGNUM in MODE, using SCRATCHREG to held intermediate value if
> +   it is necessary.  */

Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold an
intermediate value if necessary.

> +
> +static void
> +aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
> +                      HOST_WIDE_INT delta)
> +{
> +  HOST_WIDE_INT mdelta = abs_hwi (delta);
> +  rtx this_rtx = gen_rtx_REG (mode, regnum);
> +
> +  /* Do nothing if mdelta is zero.  */
> +  if (!mdelta)
> +    return;
> +
> +  /* We only need single instruction if the offset fit into add/sub.  */
> +  if (aarch64_uimm12_shift (mdelta))
> +    {
> +      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
> +      return;
> +    }
> +
> +  /* We need two add/sub instructions, each one perform part of the
> +     addition/subtraction, but don't this if the addend can be loaded into
> +     register by single instruction, in that case we prefer a move to scratch
> +     register following by addition.  */

We need two add/sub instructions, each one performing part of the
calculation.  Don't do this if the addend can be loaded into a register
with a single instruction, in that case we prefer a move to a scratch
register followed by an addition.

> +  if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
> +    {
> +      HOST_WIDE_INT low_off = mdelta & 0xfff;
> +
> +      low_off = delta < 0 ? -low_off : low_off;
> +      emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
> +      emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
> +      return;
> +    }
> +
> +  /* Otherwise use generic function to handle all other situations.  */
> +  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
> +  aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
> +  emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
> +}
> +
>  static bool
>  aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
>                                   tree exp ATTRIBUTE_UNUSED)
> @@ -3337,44 +3378,6 @@ aarch64_final_eh_return_addr (void)
>                                         - 2 * UNITS_PER_WORD));
>  }
>
> -static void
> -aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
> -                      HOST_WIDE_INT delta)
> -{
> -  HOST_WIDE_INT mdelta = delta;
> -  rtx this_rtx = gen_rtx_REG (mode, regnum);
> -  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
> -
> -  if (mdelta < 0)
> -    mdelta = -mdelta;
> -
> -  if (mdelta >= 4096 * 4096)
> -    {
> -      aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
> -      emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
> -    }
> -  else if (mdelta > 0)
> -    {
> -      if (mdelta >= 4096)
> -        {
> -          emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
> -          rtx shift = gen_rtx_ASHIFT (mode, scratch_rtx, GEN_INT (12));
> -          if (delta < 0)
> -            emit_insn (gen_rtx_SET (this_rtx,
> -                                    gen_rtx_MINUS (mode, this_rtx, shift)));
> -          else
> -            emit_insn (gen_rtx_SET (this_rtx,
> -                                    gen_rtx_PLUS (mode, this_rtx, shift)));
> -        }
> -      if (mdelta % 4096 != 0)
> -        {
> -          scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
> -          emit_insn (gen_rtx_SET (this_rtx,
> -                                  gen_rtx_PLUS (mode, this_rtx, scratch_rtx)));
> -        }
> -    }
> -}
> -
>  /* Output code to add DELTA to the first argument, and then jump
>     to FUNCTION.  Used for C++ multiple inheritance.  */
>  static void

^ permalink raw reply	[flat|nested] 9+ messages in thread
end of thread, other threads:[~2016-07-25 13:08 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <626096f0-957f-98bd-5efa-faa0c14eb5ab@foss.arm.com>
     [not found] ` <b653851c-9587-a498-a8da-8d235b4ddbcc@foss.arm.com>
2016-07-20 13:03 ` [AArch64][3/3] Migrate aarch64_expand_prologue/epilogue to aarch64_add_constant Jiong Wang
2016-07-20 14:19   ` Richard Earnshaw (lists)
2016-07-20 15:02     ` Jiong Wang
2016-07-20 15:09       ` Richard Earnshaw (lists)
2016-07-21 10:08         ` Richard Earnshaw (lists)
2016-07-25  9:34           ` Jiong Wang
2016-07-25 13:08             ` Richard Earnshaw (lists)
2016-07-20 13:03 ` [AArch64][2/3] Optimize aarch64_add_constant to generate better addition sequences Jiong Wang
2016-07-20 14:13   ` Richard Earnshaw (lists)