public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][AArch64][2/5] Improve immediate generation
@ 2015-09-02 12:35 Wilco Dijkstra
  2015-09-18 14:26 ` James Greenhalgh
  0 siblings, 1 reply; 2+ messages in thread
From: Wilco Dijkstra @ 2015-09-02 12:35 UTC (permalink / raw)
  To: 'GCC Patches'

aarch64_internal_mov_immediate uses loops iterating over all legal bitmask immediates to find
2-instruction immediate combinations. One loop is quadratic and, despite being extremely expensive,
very rarely finds a matching immediate (43 matches in all of SPEC2006 but none are emitted in final
code), so it can be removed without any effect on code quality. The other loop can be replaced by a
constant-time search: rather than iterating over all legal bitmask values, reconstruct a potential
bitmask and query the fast aarch64_bitmask_imm.

No change in generated code; passes GCC regression tests/bootstrap.

ChangeLog:
2015-09-02  Wilco Dijkstra  <wdijkstr@arm.com>

	* gcc/config/aarch64/aarch64.c (aarch64_internal_mov_immediate):
	Replace slow immediate matching loops with a faster algorithm.

---
 gcc/config/aarch64/aarch64.c | 96 +++++++++++---------------------------------
 1 file changed, 23 insertions(+), 73 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c0280e6..d6f7cb0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1376,7 +1376,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   unsigned HOST_WIDE_INT mask;
   int i;
   bool first;
-  unsigned HOST_WIDE_INT val;
+  unsigned HOST_WIDE_INT val, val2;
   bool subtargets;
   rtx subtarget;
   int one_match, zero_match, first_not_ffff_match;
@@ -1503,85 +1503,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 	}
     }
 
-  /* See if we can do it by arithmetically combining two
-     immediates.  */
-  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
+  if (zero_match != 2 && one_match != 2)
     {
-      int j;
-      mask = 0xffff;
+      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
+	 For a 64-bit bitmask try whether changing 16 bits to all ones or
+	 zeroes creates a valid bitmask.  To check any repeated bitmask,
+	 try using 16 bits from the other 32-bit half of val.  */
 
-      if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
-	  || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
+      for (i = 0; i < 64; i += 16, mask <<= 16)
 	{
-	  if (generate)
-	    {
-	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
-	      emit_insn (gen_rtx_SET (subtarget,
-				      GEN_INT (aarch64_bitmasks[i])));
-	      emit_insn (gen_adddi3 (dest, subtarget,
-				     GEN_INT (val - aarch64_bitmasks[i])));
-	    }
-	  num_insns += 2;
-	  return num_insns;
+	  val2 = val & ~mask;
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val | mask;
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val2 & ~mask;
+	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
 	}
-
-      for (j = 0; j < 64; j += 16, mask <<= 16)
+      if (i != 64)
 	{
-	  if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
+	  if (generate)
 	    {
-	      if (generate)
-		{
-		  emit_insn (gen_rtx_SET (dest,
-					  GEN_INT (aarch64_bitmasks[i])));
-		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
-					     GEN_INT ((val >> j) & 0xffff)));
-		}
-	      num_insns += 2;
-	      return num_insns;
+	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+			 GEN_INT ((val >> i) & 0xffff)));
 	    }
-	}
-    }
-
-  /* See if we can do it by logically combining two immediates.  */
-  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
-    {
-      if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
-	{
-	  int j;
-
-	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
-	    if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
-	      {
-		if (generate)
-		  {
-		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
-		    emit_insn (gen_rtx_SET (subtarget,
-					    GEN_INT (aarch64_bitmasks[i])));
-		    emit_insn (gen_iordi3 (dest, subtarget,
-					   GEN_INT (aarch64_bitmasks[j])));
-		  }
-		num_insns += 2;
-		return num_insns;
-	      }
-	}
-      else if ((val & aarch64_bitmasks[i]) == val)
-	{
-	  int j;
-
-	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
-	    if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
-	      {
-		if (generate)
-		  {
-		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
-		    emit_insn (gen_rtx_SET (subtarget,
-					    GEN_INT (aarch64_bitmasks[j])));
-		    emit_insn (gen_anddi3 (dest, subtarget,
-					   GEN_INT (aarch64_bitmasks[i])));
-		  }
-		num_insns += 2;
-		return num_insns;
-	      }
+	  return 2;
 	}
     }
 
-- 
1.8.3


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH][AArch64][2/5] Improve immediate generation
  2015-09-02 12:35 [PATCH][AArch64][2/5] Improve immediate generation Wilco Dijkstra
@ 2015-09-18 14:26 ` James Greenhalgh
  0 siblings, 0 replies; 2+ messages in thread
From: James Greenhalgh @ 2015-09-18 14:26 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: 'GCC Patches'

On Wed, Sep 02, 2015 at 01:35:19PM +0100, Wilco Dijkstra wrote:
> aarch64_internal_mov_immediate uses loops iterating over all legal bitmask
> immediates to find 2-instruction immediate combinations. One loop is
> quadratic and, despite being extremely expensive, very rarely finds a matching
> immediate (43 matches in all of SPEC2006 but none are emitted in final code),
> so it can be removed without any effect on code quality. The other loop can
> be replaced by a constant-time search: rather than iterating over all legal
> bitmask values, reconstruct a potential bitmask and query the fast
> aarch64_bitmask_imm.
> 
> No change in generated code; passes GCC regression tests/bootstrap.

Well, presumably those 43 cases in SPEC2006 changed...

The code looks OK, and I haven't seen any objections from Marcus or
Richard on the overall direction of the patch set.

OK for trunk.

Thanks,
James

> 
> ChangeLog:
> 2015-09-02  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* gcc/config/aarch64/aarch64.c (aarch64_internal_mov_immediate):
> 	Replace slow immediate matching loops with a faster algorithm.
> 
> ---
>  gcc/config/aarch64/aarch64.c | 96 +++++++++++---------------------------------
>  1 file changed, 23 insertions(+), 73 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index c0280e6..d6f7cb0 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1376,7 +1376,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>    unsigned HOST_WIDE_INT mask;
>    int i;
>    bool first;
> -  unsigned HOST_WIDE_INT val;
> +  unsigned HOST_WIDE_INT val, val2;
>    bool subtargets;
>    rtx subtarget;
>    int one_match, zero_match, first_not_ffff_match;
> @@ -1503,85 +1503,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>  	}
>      }
>  
> -  /* See if we can do it by arithmetically combining two
> -     immediates.  */
> -  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
> +  if (zero_match != 2 && one_match != 2)
>      {
> -      int j;
> -      mask = 0xffff;
> +      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
> +	 For a 64-bit bitmask try whether changing 16 bits to all ones or
> +	 zeroes creates a valid bitmask.  To check any repeated bitmask,
> +	 try using 16 bits from the other 32-bit half of val.  */
>  
> -      if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
> -	  || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
> +      for (i = 0; i < 64; i += 16, mask <<= 16)
>  	{
> -	  if (generate)
> -	    {
> -	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
> -	      emit_insn (gen_rtx_SET (subtarget,
> -				      GEN_INT (aarch64_bitmasks[i])));
> -	      emit_insn (gen_adddi3 (dest, subtarget,
> -				     GEN_INT (val - aarch64_bitmasks[i])));
> -	    }
> -	  num_insns += 2;
> -	  return num_insns;
> +	  val2 = val & ~mask;
> +	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> +	    break;
> +	  val2 = val | mask;
> +	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> +	    break;
> +	  val2 = val2 & ~mask;
> +	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> +	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> +	    break;
>  	}
> -
> -      for (j = 0; j < 64; j += 16, mask <<= 16)
> +      if (i != 64)
>  	{
> -	  if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
> +	  if (generate)
>  	    {
> -	      if (generate)
> -		{
> -		  emit_insn (gen_rtx_SET (dest,
> -					  GEN_INT (aarch64_bitmasks[i])));
> -		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
> -					     GEN_INT ((val >> j) & 0xffff)));
> -		}
> -	      num_insns += 2;
> -	      return num_insns;
> +	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +			 GEN_INT ((val >> i) & 0xffff)));
>  	    }
> -	}
> -    }
> -
> -  /* See if we can do it by logically combining two immediates.  */
> -  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
> -    {
> -      if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
> -	{
> -	  int j;
> -
> -	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
> -	    if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
> -	      {
> -		if (generate)
> -		  {
> -		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
> -		    emit_insn (gen_rtx_SET (subtarget,
> -					    GEN_INT (aarch64_bitmasks[i])));
> -		    emit_insn (gen_iordi3 (dest, subtarget,
> -					   GEN_INT (aarch64_bitmasks[j])));
> -		  }
> -		num_insns += 2;
> -		return num_insns;
> -	      }
> -	}
> -      else if ((val & aarch64_bitmasks[i]) == val)
> -	{
> -	  int j;
> -
> -	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
> -	    if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
> -	      {
> -		if (generate)
> -		  {
> -		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
> -		    emit_insn (gen_rtx_SET (subtarget,
> -					    GEN_INT (aarch64_bitmasks[j])));
> -		    emit_insn (gen_anddi3 (dest, subtarget,
> -					   GEN_INT (aarch64_bitmasks[i])));
> -		  }
> -		num_insns += 2;
> -		return num_insns;
> -	      }
> +	  return 2;
>  	}
>      }
>  
> -- 
> 1.8.3
> 
> 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-09-18 14:26 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-02 12:35 [PATCH][AArch64][2/5] Improve immediate generation Wilco Dijkstra
2015-09-18 14:26 ` James Greenhalgh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).