* [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Wilco Dijkstra @ 2022-10-04 17:22 UTC
  To: GCC Patches; +Cc: Richard Sandiford

Improve the expansion of immediates that can be created from a bitmask
immediate and 2 MOVKs.  This reduces the number of 4-instruction
immediates in SPECINT/FP by 10-15%.
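
For illustration - a sketch rather than actual compiler output, with the
register choice and operand syntax assumed - the new 3-instruction
expansion for the f2 test case below is:

	mov	x0, 0xaaaaaaaaaaaaaaaa	// bitmask immediate
	movk	x0, 0x5678, lsl 32
	movk	x0, 0x1234, lsl 48

where previously this constant needed a MOV plus three MOVKs.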

Passes regress, OK for commit?

gcc/ChangeLog:

	PR target/106583
	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
	Add support for a bitmask immediate with 2 MOVKs.

gcc/testsuite:
	PR target/106583
	* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
     {
       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
@@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 	}
     }
 
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+    {
+      mask = 0xffffffff;
+
+      for (i = 0; i < 64; i += 16)
+	{
+	  val2 = val & ~mask;
+	  if (aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val | mask;
+	  if (aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val2 & ~mask;
+	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
+	  if (aarch64_bitmask_imm (val2, mode))
+	    break;
+
+	  mask = (mask << 16) | (mask >> 48);
+	}
+
+      if (i != 64)
+	{
+	  if (generate)
+	    {
+	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+					 GEN_INT ((val >> i) & 0xffff)));
+	      i = (i + 16) & 63;
+	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+					 GEN_INT ((val >> i) & 0xffff)));
+	    }
+
+	  return 3;
+	}
+    }
+
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
      otherwise skip zero bits.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0a027a0950e506d4ddaacce5e151f57070948dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678aaaaaaaa;
+}
+
+long f3 (void)
+{
+  return 0x1234cccccccc5678;
+}
+
+long f4 (void)
+{
+  return 0x7777123456787777;
+}
+
+long f5 (void)
+{
+  return 0x5555555512345678;
+}
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */



* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Richard Sandiford @ 2022-10-05  8:42 UTC
  To: Wilco Dijkstra; +Cc: GCC Patches

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Improve the expansion of immediates that can be created from a bitmask
> immediate and 2 MOVKs.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
>         PR target/106583
>         * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
>         Add support for a bitmask immediate with 2 MOVKs.
>
> gcc/testsuite:
>         PR target/106583
>         * gcc.target/aarch64/pr106583.c: Add new test.

Nice.

Did you consider handling the case where the movks aren't for
consecutive bitranges?  E.g. the patch handles:

  0x12345678aaaaaaaa

and:

  0x1234cccccccc5678

but it looks like it would be fairly easy to extend it to:

  0x1234cccc5678cccc

too.
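
(Sketching with an arbitrary register, that constant would then expand as

  mov	x0, 0xcccccccccccccccc
  movk	x0, 0x5678, lsl 16
  movk	x0, 0x1234, lsl 48

i.e. the two movks patch non-adjacent 16-bit chunks.)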

Also, could you commonise:

	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;

?  It's subtle enough that IMO it'd be better not to cut-&-paste it.

Thanks,
Richard

> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>    one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>      ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>
> -  if (zero_match != 2 && one_match != 2)
> +  if (zero_match < 2 && one_match < 2)
>      {
>        /* Try emitting a bitmask immediate with a movk replacing 16 bits.
>          For a 64-bit bitmask try whether changing 16 bits to all ones or
> @@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>         }
>      }
>
> +  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
> +  if (zero_match + one_match == 0)
> +    {
> +      mask = 0xffffffff;
> +
> +      for (i = 0; i < 64; i += 16)
> +       {
> +         val2 = val & ~mask;
> +         if (aarch64_bitmask_imm (val2, mode))
> +           break;
> +         val2 = val | mask;
> +         if (aarch64_bitmask_imm (val2, mode))
> +           break;
> +         val2 = val2 & ~mask;
> +         val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> +         if (aarch64_bitmask_imm (val2, mode))
> +           break;
> +
> +         mask = (mask << 16) | (mask >> 48);
> +       }
> +
> +      if (i != 64)
> +       {
> +         if (generate)
> +           {
> +             emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +                                        GEN_INT ((val >> i) & 0xffff)));
> +             i = (i + 16) & 63;
> +             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +                                        GEN_INT ((val >> i) & 0xffff)));
> +           }
> +
> +         return 3;
> +       }
> +    }
> +
>    /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
>       are emitted by the initial mov.  If one_match > zero_match, skip set bits,
>       otherwise skip zero bits.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f0a027a0950e506d4ddaacce5e151f57070948dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> +  return 0x7efefefefefefeff;
> +}
> +
> +long f2 (void)
> +{
> +  return 0x12345678aaaaaaaa;
> +}
> +
> +long f3 (void)
> +{
> +  return 0x1234cccccccc5678;
> +}
> +
> +long f4 (void)
> +{
> +  return 0x7777123456787777;
> +}
> +
> +long f5 (void)
> +{
> +  return 0x5555555512345678;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */


* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Wilco Dijkstra @ 2022-10-06 17:29 UTC
  To: Richard Sandiford; +Cc: GCC Patches

Hi Richard,

> Did you consider handling the case where the movks aren't for
> consecutive bitranges?  E.g. the patch handles:

> but it looks like it would be fairly easy to extend it to:
>
>  0x1234cccc5678cccc

Yes, with a more general search loop we can get that case too -
it doesn't trigger much though. The code that checks for this is
now refactored into a new function. Given there are now many
more calls to aarch64_bitmask_imm, I added a streamlined internal
entry point that assumes the input is 64-bit.
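
As an aside, the replication of sub-word immediates in the remaining
mode-taking wrapper now becomes a single multiply by an entry of
bitmask_imm_mul; a sketch with an assumed example value:

  /* QImode: size = 8, so __builtin_clz (8) - 26 == 28 - 26 == 2
     (with 32-bit int), selecting 0x0101010101010101.  */
  val &= 0xff;			/* e.g. val == 0xc3 */
  val *= 0x0101010101010101;	/* val == 0xc3c3c3c3c3c3c3c3 */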

Cheers,
Wilco

[PATCH v2][AArch64] Improve immediate expansion [PR106583]

Improve the expansion of immediates that can be created from a bitmask
immediate and 2 MOVKs.  Simplify, refactor, and improve the efficiency
of the bitmask checks.  This reduces the number of 4-instruction
immediates in SPECINT/FP by 10-15%.

Passes regress, OK for commit?

gcc/ChangeLog:

	PR target/106583
	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
	Add support for a bitmask immediate with 2 MOVKs.
        (aarch64_check_bitmask): New function, refactored from
        aarch64_internal_mov_immediate.
        (aarch64_replicate_bitmask_imm): Remove function, merge into...
        (aarch64_bitmask_imm): Simplify replication of small modes.
        Split function into a 64-bit only version for efficiency.

gcc/testsuite:
	PR target/106583
	* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
 					     factor, nelts_per_vq);
 }
 
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+		       unsigned HOST_WIDE_INT &val2,
+		       unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  return false;
+}
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 				scalar_int_mode mode)
@@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
     {
       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
 	 try using 16 bits from the other 32-bit half of val.  */
 
-      for (i = 0; i < 64; i += 16, mask <<= 16)
-	{
-	  val2 = val & ~mask;
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
-	    break;
-	  val2 = val | mask;
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
-	    break;
-	  val2 = val2 & ~mask;
-	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
-	    break;
-	}
-      if (i != 64)
-	{
-	  if (generate)
+      for (i = 0; i < 64; i += 16)
+	if (aarch64_check_bitmask (val, val2, mask << i))
+	  {
+	    if (generate)
+	      {
+		emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+		emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+					   GEN_INT ((val >> i) & 0xffff)));
+	      }
+	    return 2;
+	  }
+    }
+
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+    {
+      for (i = 0; i < 48; i += 16)
+	for (int j = i + 16; j < 64; j += 16)
+	  if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
 	    {
-	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
-	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-					 GEN_INT ((val >> i) & 0xffff)));
+	      if (generate)
+		{
+		  emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+		  emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+					     GEN_INT ((val >> i) & 0xffff)));
+		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
+					       GEN_INT ((val >> j) & 0xffff)));
+		}
+	      return 3;
 	    }
-	  return 2;
-	}
     }
 
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
@@ -10168,22 +10200,6 @@ aarch64_movk_shift (const wide_int_ref &and_val,
   return -1;
 }
 
-/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
-   64-bit (DImode) integer.  */
-
-static unsigned HOST_WIDE_INT
-aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
-{
-  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
-  while (size < 64)
-    {
-      val &= (HOST_WIDE_INT_1U << size) - 1;
-      val |= val << size;
-      size *= 2;
-    }
-  return val;
-}
-
 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
 
 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
@@ -10196,26 +10212,42 @@ static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
   };
 
 
-/* Return true if val is a valid bitmask immediate.  */
-
+/* Return true if val is a valid bitmask immediate for any mode.  */
 bool
 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
 {
-  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
+  if (mode == DImode)
+    return aarch64_bitmask_imm (val_in);
+
+  unsigned HOST_WIDE_INT val = val_in;
+
+  if (mode == SImode)
+    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
+
+  /* Replicate small immediates to fit 64 bits.  */
+  int size = GET_MODE_UNIT_PRECISION (mode);
+  val &= (HOST_WIDE_INT_1U << size) - 1;
+  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
+
+  return aarch64_bitmask_imm (val);
+}
+
+
+/* Return true if 64-bit val is a valid bitmask immediate.  */
+
+static bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
+{
+  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
   int bits;
 
   /* Check for a single sequence of one bits and return quickly if so.
      The special cases of all ones and all zeroes returns false.  */
-  val = aarch64_replicate_bitmask_imm (val_in, mode);
   tmp = val + (val & -val);
 
   if (tmp == (tmp & -tmp))
     return (val + 1) > 1;
 
-  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
-  if (mode == SImode)
-    val = (val << 32) | (val & 0xffffffff);
-
   /* Invert if the immediate doesn't start with a zero bit - this means we
      only need to search for sequences of one bits.  */
   if (val & 1)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678aaaaaaaa;
+}
+
+long f3 (void)
+{
+  return 0x1234cccccccc5678;
+}
+
+long f4 (void)
+{
+  return 0x7777123456787777;
+}
+
+long f5 (void)
+{
+  return 0x5555555512345678;
+}
+
+long f6 (void)
+{
+  return 0x1234bbbb5678bbbb;
+}
+
+long f7 (void)
+{
+  return 0x4444123444445678;
+}
+
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */



* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Richard Sandiford @ 2022-10-07 13:17 UTC
  To: Wilco Dijkstra via Gcc-patches; +Cc: Wilco Dijkstra

Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi Richard,
>
>> Did you consider handling the case where the movks aren't for
>> consecutive bitranges?  E.g. the patch handles:
>
>> but it looks like it would be fairly easy to extend it to:
>>
>>  0x1234cccc5678cccc
>
> Yes, with a more general search loop we can get that case too -
> it doesn't trigger much though. The code that checks for this is
> now refactored into a new function. Given there are now many
> more calls to aarch64_bitmask_imm, I added a streamlined internal
> entry point that assumes the input is 64-bit.

Sounds good, but could you put it before the mode version,
to avoid the forward declaration?

OK with that change, thanks.

Richard

> Cheers,
> Wilco
>
> [PATCH v2][AArch64] Improve immediate expansion [PR106583]
>
> Improve the expansion of immediates that can be created from a bitmask
> immediate and 2 MOVKs.  Simplify, refactor, and improve the efficiency
> of the bitmask checks.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
> 	PR target/106583
> 	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
> 	Add support for a bitmask immediate with 2 MOVKs.
>         (aarch64_check_bitmask): New function, refactored from
>         aarch64_internal_mov_immediate.
>         (aarch64_replicate_bitmask_imm): Remove function, merge into...
>         (aarch64_bitmask_imm): Simplify replication of small modes.
>         Split function into a 64-bit only version for efficiency.
>
> gcc/testsuite:
> 	PR target/106583
> 	* gcc.target/aarch64/pr106583.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>  					    aarch64_addr_query_type);
>  static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
> +static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
>  
>  /* The processor for which instructions should be scheduled.  */
>  enum aarch64_processor aarch64_tune = cortexa53;
> @@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
>  					     factor, nelts_per_vq);
>  }
>  
> +/* Return true if the immediate VAL can be a bitfield immediate
> +   by changing the given MASK bits in VAL to zeroes, ones or bits
> +   from the other half of VAL.  Return the new immediate in VAL2.  */
> +static inline bool
> +aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
> +		       unsigned HOST_WIDE_INT &val2,
> +		       unsigned HOST_WIDE_INT mask)
> +{
> +  val2 = val & ~mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val2 = val | mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val = val & ~mask;
> +  val2 = val | (((val >> 32) | (val << 32)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val2 = val | (((val >> 16) | (val << 48)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  return false;
> +}
> +
>  static int
>  aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>  				scalar_int_mode mode)
> @@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>    one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>      ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>  
> -  if (zero_match != 2 && one_match != 2)
> +  if (zero_match < 2 && one_match < 2)
>      {
>        /* Try emitting a bitmask immediate with a movk replacing 16 bits.
>  	 For a 64-bit bitmask try whether changing 16 bits to all ones or
>  	 zeroes creates a valid bitmask.  To check any repeated bitmask,
>  	 try using 16 bits from the other 32-bit half of val.  */
>  
> -      for (i = 0; i < 64; i += 16, mask <<= 16)
> -	{
> -	  val2 = val & ~mask;
> -	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -	    break;
> -	  val2 = val | mask;
> -	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -	    break;
> -	  val2 = val2 & ~mask;
> -	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> -	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -	    break;
> -	}
> -      if (i != 64)
> -	{
> -	  if (generate)
> +      for (i = 0; i < 64; i += 16)
> +	if (aarch64_check_bitmask (val, val2, mask << i))
> +	  {
> +	    if (generate)
> +	      {
> +		emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +		emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +					   GEN_INT ((val >> i) & 0xffff)));
> +	      }
> +	    return 2;
> +	  }
> +    }
> +
> +  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
> +  if (zero_match + one_match == 0)
> +    {
> +      for (i = 0; i < 48; i += 16)
> +	for (int j = i + 16; j < 64; j += 16)
> +	  if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
>  	    {
> -	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> -	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> -					 GEN_INT ((val >> i) & 0xffff)));
> +	      if (generate)
> +		{
> +		  emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +		  emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +					     GEN_INT ((val >> i) & 0xffff)));
> +		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
> +					       GEN_INT ((val >> j) & 0xffff)));
> +		}
> +	      return 3;
>  	    }
> -	  return 2;
> -	}
>      }
>  
>    /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
> @@ -10168,22 +10200,6 @@ aarch64_movk_shift (const wide_int_ref &and_val,
>    return -1;
>  }
>  
> -/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
> -   64-bit (DImode) integer.  */
> -
> -static unsigned HOST_WIDE_INT
> -aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
> -{
> -  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
> -  while (size < 64)
> -    {
> -      val &= (HOST_WIDE_INT_1U << size) - 1;
> -      val |= val << size;
> -      size *= 2;
> -    }
> -  return val;
> -}
> -
>  /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
>  
>  static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
> @@ -10196,26 +10212,42 @@ static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
>    };
>  
>  
> -/* Return true if val is a valid bitmask immediate.  */
> -
> +/* Return true if val is a valid bitmask immediate for any mode.  */
>  bool
>  aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
>  {
> -  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
> +  if (mode == DImode)
> +    return aarch64_bitmask_imm (val_in);
> +
> +  unsigned HOST_WIDE_INT val = val_in;
> +
> +  if (mode == SImode)
> +    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
> +
> +  /* Replicate small immediates to fit 64 bits.  */
> +  int size = GET_MODE_UNIT_PRECISION (mode);
> +  val &= (HOST_WIDE_INT_1U << size) - 1;
> +  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
> +
> +  return aarch64_bitmask_imm (val);
> +}
> +
> +
> +/* Return true if 64-bit val is a valid bitmask immediate.  */
> +
> +static bool
> +aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
> +{
> +  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
>    int bits;
>  
>    /* Check for a single sequence of one bits and return quickly if so.
>       The special cases of all ones and all zeroes returns false.  */
> -  val = aarch64_replicate_bitmask_imm (val_in, mode);
>    tmp = val + (val & -val);
>  
>    if (tmp == (tmp & -tmp))
>      return (val + 1) > 1;
>  
> -  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
> -  if (mode == SImode)
> -    val = (val << 32) | (val & 0xffffffff);
> -
>    /* Invert if the immediate doesn't start with a zero bit - this means we
>       only need to search for sequences of one bits.  */
>    if (val & 1)
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> @@ -0,0 +1,41 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> +  return 0x7efefefefefefeff;
> +}
> +
> +long f2 (void)
> +{
> +  return 0x12345678aaaaaaaa;
> +}
> +
> +long f3 (void)
> +{
> +  return 0x1234cccccccc5678;
> +}
> +
> +long f4 (void)
> +{
> +  return 0x7777123456787777;
> +}
> +
> +long f5 (void)
> +{
> +  return 0x5555555512345678;
> +}
> +
> +long f6 (void)
> +{
> +  return 0x1234bbbb5678bbbb;
> +}
> +
> +long f7 (void)
> +{
> +  return 0x4444123444445678;
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */


* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Wilco Dijkstra @ 2022-10-07 13:48 UTC
  To: Richard Sandiford, Wilco Dijkstra via Gcc-patches

Hi Richard,

>> Yes, with a more general search loop we can get that case too -
>> it doesn't trigger much though. The code that checks for this is
>> now refactored into a new function. Given there are now many
>> more calls to aarch64_bitmask_imm, I added a streamlined internal
>> entry point that assumes the input is 64-bit.
>
> Sounds good, but could you put it before the mode version,
> to avoid the forward declaration?

I can swap them around but the forward declaration is still required as
aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
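
(Concretely, that is the one-line declaration the patch adds near the top
of aarch64.cc:

  static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);

so aarch64_check_bitmask can call the 64-bit overload defined much later.)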

Cheers,
Wilco


* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Richard Sandiford @ 2022-10-07 14:49 UTC
  To: Wilco Dijkstra; +Cc: Wilco Dijkstra via Gcc-patches

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> Yes, with a more general search loop we can get that case too -
>>> it doesn't trigger much though. The code that checks for this is
>>> now refactored into a new function. Given there are now many
>>> more calls to aarch64_bitmask_imm, I added a streamlined internal
>>> entry point that assumes the input is 64-bit.
>>
>> Sounds good, but could you put it before the mode version,
>> to avoid the forward declaration?
>
> I can swap them around but the forward declaration is still required as
> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.

OK, how about moving them both above aarch64_check_bitmask?

Richard


* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Wilco Dijkstra @ 2022-10-12 14:57 UTC
  To: Richard Sandiford; +Cc: Wilco Dijkstra via Gcc-patches

Hi Richard,

>>> Sounds good, but could you put it before the mode version,
>>> to avoid the forward declaration?
>>
>> I can swap them around but the forward declaration is still required as
>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>
> OK, how about moving them both above aarch64_check_bitmask?

Sure, I've moved them as well as all the related helper functions - it
makes the diff quite large, but they are all together now, which makes
sense.  I also refactored aarch64_move_imm to handle the case of a 64-bit
immediate being generated by a 32-bit MOVZ/MOVN - this simplifies
aarch64_internal_mov_immediate and the movdi patterns even further.
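
As a sketch of the MOVZ/MOVN case (register and constant chosen purely for
illustration): 0xffff1234 cannot be loaded with a single 64-bit move since
the 64-bit MOVN form would set the upper 32 bits, but the 32-bit form

	mov	w0, 0xffff1234		// movn w0, #0xedcb, zero-extends

loads it in one instruction, so aarch64_move_imm now accepts such values
and the new aarch64_zeroextended_move_imm makes the movdi pattern print
the %w form.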

Cheers,
Wilco

v3: Move the immediate handling code together to avoid forward
declarations; further cleanups and simplifications.

Improve the expansion of immediates that can be created from a bitmask
immediate and 2 MOVKs.  Simplify, refactor, and improve the efficiency of
the bitmask checks and of move immediate.  Move the various immediate
handling functions together to avoid forward declarations.  Accept 32-bit
MOVZ/MOVN as valid 64-bit immediates.  Add a new constraint so the movdi
pattern only needs a single alternative for move immediate.

This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.

Passes bootstrap & regress, OK for commit?

gcc/ChangeLog:

	PR target/106583
	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
	Add support for a bitmask immediate with 2 MOVKs.
        (aarch64_check_bitmask): New function, refactored from
        aarch64_internal_mov_immediate.
        (aarch64_replicate_bitmask_imm): Remove function, merge into...
        (aarch64_bitmask_imm): Simplify replication of small modes.
        Split function into a 64-bit only version for efficiency.
        (aarch64_zeroextended_move_imm): New function.
        (aarch64_move_imm): Refactor code.
        (aarch64_uimm12_shift): Move near other immediate functions.
        (aarch64_clamp_to_uimm12_shift): Likewise.
        (aarch64_movk_shift): Likewise.
        (aarch64_replicate_bitmask_imm): Likewise.
        (aarch64_and_split_imm1): Likewise.
        (aarch64_and_split_imm2): Likewise.
        (aarch64_and_bitmask_imm): Likewise.
        (aarch64_movw_imm): Remove.
        * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
        constraints into single 'O'.
        (mov<mode>_aarch64): Likewise.
        * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
        (aarch64_bitmask_imm): Likewise.
        (aarch64_uimm12_shift): Likewise.
        (aarch64_zeroextended_move_imm): New prototype.
        * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
        limit 'N' to 64-bit only moves.

gcc/testsuite:
	PR target/106583
	* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
@@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
 					unsigned HOST_WIDE_INT,
 					unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4de55beb067ea8f0be0a90060a785c94bdee708b..785ec07692981d423582051ac0897e5dbc3a001f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -305,7 +305,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
-static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5501,142 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
 					     factor, nelts_per_vq);
 }
 
+/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
+
+static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
+  {
+    0x0000000100000001ull,
+    0x0001000100010001ull,
+    0x0101010101010101ull,
+    0x1111111111111111ull,
+    0x5555555555555555ull,
+  };
+
+
+/* Return true if 64-bit VAL is a valid bitmask immediate.  */
+static bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
+{
+  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
+  int bits;
+
+  /* Check for a single sequence of one bits and return quickly if so.
+     The special cases of all ones and all zeroes returns false.  */
+  tmp = val + (val & -val);
+
+  if (tmp == (tmp & -tmp))
+    return (val + 1) > 1;
+
+  /* Invert if the immediate doesn't start with a zero bit - this means we
+     only need to search for sequences of one bits.  */
+  if (val & 1)
+    val = ~val;
+
+  /* Find the first set bit and set tmp to val with the first sequence of one
+     bits removed.  Return success if there is a single sequence of ones.  */
+  first_one = val & -val;
+  tmp = val & (val + first_one);
+
+  if (tmp == 0)
+    return true;
+
+  /* Find the next set bit and compute the difference in bit position.  */
+  next_one = tmp & -tmp;
+  bits = clz_hwi (first_one) - clz_hwi (next_one);
+  mask = val ^ tmp;
+
+  /* Check the bit position difference is a power of 2, and that the first
+     sequence of one bits fits within 'bits' bits.  */
+  if ((mask >> bits) != 0 || bits != (bits & -bits))
+    return false;
+
+  /* Check the sequence of one bits is repeated 64/bits times.  */
+  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
+}
+
+
+/* Return true if VAL is a valid bitmask immediate for any mode.  */
+bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
+{
+  if (mode == DImode)
+    return aarch64_bitmask_imm (val);
+
+  if (mode == SImode)
+    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
+
+  /* Replicate small immediates to fit 64 bits.  */
+  int size = GET_MODE_UNIT_PRECISION (mode);
+  val &= (HOST_WIDE_INT_1U << size) - 1;
+  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
+
+  return aarch64_bitmask_imm (val);
+}
+
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+		       unsigned HOST_WIDE_INT &val2,
+		       unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  return false;
+}
+
+/* Return true if immediate VAL can only be created by using a 32-bit
+   zero-extended move immediate, not by a 64-bit move.  */
+bool
+aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT val)
+{
+  if ((val >> 16) == 0 || (val >> 32) != 0 || (val & 0xffff) == 0)
+    return false;
+  return !aarch64_bitmask_imm (val);
+}
+
+/* Return true if VAL is an immediate that can be created by a single
+   MOV instruction.  */
+bool
+aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
+{
+  unsigned HOST_WIDE_INT val2;
+
+  if (val < 65536)
+    return true;
+
+  val2 = val ^ ((HOST_WIDE_INT) val >> 63);
+  if ((val2 >> (__builtin_ctzll (val2) & 48)) < 65536)
+    return true;
+
+  /* Special case 0xyyyyffffffffffff. */
+  if (((val2 + 1) << 16) == 0)
+    return true;
+
+  /* Special case immediates 0xffffyyyy and 0xyyyyffff.  */
+  val2 = (mode == DImode) ? val : val2;
+  if (((val2 + 1) & ~(unsigned HOST_WIDE_INT) 0xffff0000) == 0
+      || (val2 >> 16) == 0xffff)
+    return true;
+
+  if (mode == SImode || (val >> 32) == 0)
+    val = (val & 0xffffffff) | (val << 32);
+  return aarch64_bitmask_imm (val);
+}
+
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 				scalar_int_mode mode)
@@ -5520,31 +5655,6 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
       return 1;
     }
 
-  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
-     (with XXXX non-zero). In that case check to see if the move can be done in
-     a smaller mode.  */
-  val2 = val & 0xffffffff;
-  if (mode == DImode
-      && aarch64_move_imm (val2, SImode)
-      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
-    {
-      if (generate)
-	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
-
-      /* Check if we have to emit a second instruction by checking to see
-         if any of the upper 32 bits of the original DI mode value is set.  */
-      if (val == val2)
-	return 1;
-
-      i = (val >> 48) ? 48 : 32;
-
-      if (generate)
-	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-				    GEN_INT ((val >> i) & 0xffff)));
-
-      return 2;
-    }
-
   if ((val >> 32) == 0 || mode == SImode)
     {
       if (generate)
@@ -5568,26 +5678,20 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  /* Try a bitmask immediate and a movk to generate the immediate
+     in 2 instructions.  */
+  if (zero_match < 2 && one_match < 2)
     {
-      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
-	 For a 64-bit bitmask try whether changing 16 bits to all ones or
-	 zeroes creates a valid bitmask.  To check any repeated bitmask,
-	 try using 16 bits from the other 32-bit half of val.  */
-
-      for (i = 0; i < 64; i += 16, mask <<= 16)
+      for (i = 0; i < 64; i += 16)
 	{
-	  val2 = val & ~mask;
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
-	    break;
-	  val2 = val | mask;
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	  if (aarch64_check_bitmask (val, val2, mask << i))
 	    break;
-	  val2 = val2 & ~mask;
-	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
-	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+
+	  val2 = val & ~(mask << i);
+	  if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
 	    break;
 	}
+
       if (i != 64)
 	{
 	  if (generate)
@@ -5600,6 +5704,25 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 	}
     }
 
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+    {
+      for (i = 0; i < 48; i += 16)
+	for (int j = i + 16; j < 64; j += 16)
+	  if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
+	    {
+	      if (generate)
+		{
+		  emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+		  emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+					     GEN_INT ((val >> i) & 0xffff)));
+		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
+					       GEN_INT ((val >> j) & 0xffff)));
+		}
+	      return 3;
+	    }
+    }
+
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
      otherwise skip zero bits.  */
@@ -5643,6 +5766,95 @@ aarch64_mov128_immediate (rtx imm)
 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
 }
 
+/* Return true if val can be encoded as a 12-bit unsigned immediate with
+   a left shift of 0 or 12 bits.  */
+bool
+aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
+{
+  return val < 4096 || (val & 0xfff000) == val;
+}
+
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
+{
+  /* Check to see if the value fits in 24 bits, as that is the maximum we can
+     handle correctly.  */
+  gcc_assert (val < 0x1000000);
+
+  if (val < 4096)
+    return val;
+
+  return val & 0xfff000;
+}
+
+/* Test whether:
+
+     X = (X & AND_VAL) | IOR_VAL;
+
+   can be implemented using:
+
+     MOVK X, #(IOR_VAL >> shift), LSL #shift
+
+   Return the shift if so, otherwise return -1.  */
+int
+aarch64_movk_shift (const wide_int_ref &and_val,
+		    const wide_int_ref &ior_val)
+{
+  unsigned int precision = and_val.get_precision ();
+  unsigned HOST_WIDE_INT mask = 0xffff;
+  for (unsigned int shift = 0; shift < precision; shift += 16)
+    {
+      if (and_val == ~mask && (ior_val & mask) == ior_val)
+	return shift;
+      mask <<= 16;
+    }
+  return -1;
+}
+
+/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
+   Assumed precondition: VAL_IN Is not zero.  */
+
+unsigned HOST_WIDE_INT
+aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
+{
+  int lowest_bit_set = ctz_hwi (val_in);
+  int highest_bit_set = floor_log2 (val_in);
+  gcc_assert (val_in != 0);
+
+  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
+	  (HOST_WIDE_INT_1U << lowest_bit_set));
+}
+
+/* Create constant where bits outside of lowest bit set to highest bit set
+   are set to 1.  */
+
+unsigned HOST_WIDE_INT
+aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
+{
+  return val_in | ~aarch64_and_split_imm1 (val_in);
+}
+
+/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
+
+bool
+aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
+{
+  scalar_int_mode int_mode;
+  if (!is_a <scalar_int_mode> (mode, &int_mode))
+    return false;
+
+  if (aarch64_bitmask_imm (val_in, int_mode))
+    return false;
+
+  if (aarch64_move_imm (val_in, int_mode))
+    return false;
+
+  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
+
+  return aarch64_bitmask_imm (imm2, int_mode);
+}
 
 /* Return the number of temporary registers that aarch64_add_offset_1
    would need to add OFFSET to a register.  */
@@ -10098,208 +10310,6 @@ aarch64_tls_referenced_p (rtx x)
   return false;
 }
 
-
-/* Return true if val can be encoded as a 12-bit unsigned immediate with
-   a left shift of 0 or 12 bits.  */
-bool
-aarch64_uimm12_shift (HOST_WIDE_INT val)
-{
-  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
-	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
-	  );
-}
-
-/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
-   that can be created with a left shift of 0 or 12.  */
-static HOST_WIDE_INT
-aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
-{
-  /* Check to see if the value fits in 24 bits, as that is the maximum we can
-     handle correctly.  */
-  gcc_assert ((val & 0xffffff) == val);
-
-  if (((val & 0xfff) << 0) == val)
-    return val;
-
-  return val & (0xfff << 12);
-}
-
-/* Return true if val is an immediate that can be loaded into a
-   register by a MOVZ instruction.  */
-static bool
-aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
-{
-  if (GET_MODE_SIZE (mode) > 4)
-    {
-      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
-	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
-	return 1;
-    }
-  else
-    {
-      /* Ignore sign extension.  */
-      val &= (HOST_WIDE_INT) 0xffffffff;
-    }
-  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
-	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
-}
-
-/* Test whether:
-
-     X = (X & AND_VAL) | IOR_VAL;
-
-   can be implemented using:
-
-     MOVK X, #(IOR_VAL >> shift), LSL #shift
-
-   Return the shift if so, otherwise return -1.  */
-int
-aarch64_movk_shift (const wide_int_ref &and_val,
-		    const wide_int_ref &ior_val)
-{
-  unsigned int precision = and_val.get_precision ();
-  unsigned HOST_WIDE_INT mask = 0xffff;
-  for (unsigned int shift = 0; shift < precision; shift += 16)
-    {
-      if (and_val == ~mask && (ior_val & mask) == ior_val)
-	return shift;
-      mask <<= 16;
-    }
-  return -1;
-}
-
-/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
-   64-bit (DImode) integer.  */
-
-static unsigned HOST_WIDE_INT
-aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
-{
-  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
-  while (size < 64)
-    {
-      val &= (HOST_WIDE_INT_1U << size) - 1;
-      val |= val << size;
-      size *= 2;
-    }
-  return val;
-}
-
-/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
-
-static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
-  {
-    0x0000000100000001ull,
-    0x0001000100010001ull,
-    0x0101010101010101ull,
-    0x1111111111111111ull,
-    0x5555555555555555ull,
-  };
-
-
-/* Return true if val is a valid bitmask immediate.  */
-
-bool
-aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
-{
-  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
-  int bits;
-
-  /* Check for a single sequence of one bits and return quickly if so.
-     The special cases of all ones and all zeroes returns false.  */
-  val = aarch64_replicate_bitmask_imm (val_in, mode);
-  tmp = val + (val & -val);
-
-  if (tmp == (tmp & -tmp))
-    return (val + 1) > 1;
-
-  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
-  if (mode == SImode)
-    val = (val << 32) | (val & 0xffffffff);
-
-  /* Invert if the immediate doesn't start with a zero bit - this means we
-     only need to search for sequences of one bits.  */
-  if (val & 1)
-    val = ~val;
-
-  /* Find the first set bit and set tmp to val with the first sequence of one
-     bits removed.  Return success if there is a single sequence of ones.  */
-  first_one = val & -val;
-  tmp = val & (val + first_one);
-
-  if (tmp == 0)
-    return true;
-
-  /* Find the next set bit and compute the difference in bit position.  */
-  next_one = tmp & -tmp;
-  bits = clz_hwi (first_one) - clz_hwi (next_one);
-  mask = val ^ tmp;
-
-  /* Check the bit position difference is a power of 2, and that the first
-     sequence of one bits fits within 'bits' bits.  */
-  if ((mask >> bits) != 0 || bits != (bits & -bits))
-    return false;
-
-  /* Check the sequence of one bits is repeated 64/bits times.  */
-  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
-}
-
-/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.  
-   Assumed precondition: VAL_IN Is not zero.  */
-
-unsigned HOST_WIDE_INT
-aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
-{
-  int lowest_bit_set = ctz_hwi (val_in);
-  int highest_bit_set = floor_log2 (val_in);
-  gcc_assert (val_in != 0);
-
-  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
-	  (HOST_WIDE_INT_1U << lowest_bit_set));
-}
-
-/* Create constant where bits outside of lowest bit set to highest bit set
-   are set to 1.  */
-
-unsigned HOST_WIDE_INT
-aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
-{
-  return val_in | ~aarch64_and_split_imm1 (val_in);
-}
-
-/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
-
-bool
-aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
-{
-  scalar_int_mode int_mode;
-  if (!is_a <scalar_int_mode> (mode, &int_mode))
-    return false;
-
-  if (aarch64_bitmask_imm (val_in, int_mode))
-    return false;
-
-  if (aarch64_move_imm (val_in, int_mode))
-    return false;
-
-  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
-
-  return aarch64_bitmask_imm (imm2, int_mode);
-}
-
-/* Return true if val is an immediate that can be loaded into a
-   register in a single instruction.  */
-bool
-aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
-{
-  scalar_int_mode int_mode;
-  if (!is_a <scalar_int_mode> (mode, &int_mode))
-    return false;
-
-  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
-    return 1;
-  return aarch64_bitmask_imm (val, int_mode);
-}
-
 static bool
 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
 {
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0a7633e5dd6d45282edd7a1088c14b555bc09b40..23ceca48543d23b85beea1f0bf98ef83051d80b6 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1309,16 +1309,15 @@ (define_insn_and_split "*movsi_aarch64"
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,O,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
    mov\\t%x0, %x1
    mov\\t%0, %x1
    mov\\t%x0, %1
-   mov\\t%x0, %1
-   mov\\t%w0, %1
+   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";
    #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
@@ -1340,11 +1339,11 @@ (define_insn_and_split "*movdi_aarch64"
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,
 		     load_8,load_8,store_8,store_8,load_8,adr,adr,f_mcr,f_mrc,
 		     fmov,neon_move")
-   (set_attr "arch"   "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
-   (set_attr "length" "4,4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
+   (set_attr "arch"   "*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
+   (set_attr "length" "4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
 )
 
 (define_insn "insv_imm<mode>"
@@ -1508,7 +1507,7 @@ (define_insn "*mov<mode>_aarch64"
 
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
-	(match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
+	(match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,O"))]
   "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
     || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
@@ -1523,7 +1522,7 @@ (define_insn "*mov<mode>_aarch64"
    ldr\\t%x0, %1
    str\\t%x1, %0
    mov\\t%x0, %x1
-   mov\\t%x0, %1"
+   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";"
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
 		     f_loadd,f_stored,load_8,store_8,mov_reg,\
 		     fconstd")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..e91c7eab0b3674ca34ac2f790c38fcd27986c35f 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -106,6 +106,12 @@ (define_constraint "M"
 
 (define_constraint "N"
  "A constant that can be used with a 64-bit MOV immediate operation."
+ (and (match_code "const_int")
+      (match_test "aarch64_move_imm (ival, DImode)")
+      (match_test "!aarch64_zeroextended_move_imm (ival)")))
+
+(define_constraint "O"
+ "A constant that can be used with a 32 or 64-bit MOV immediate operation."
  (and (match_code "const_int")
       (match_test "aarch64_move_imm (ival, DImode)")))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678aaaaaaaa;
+}
+
+long f3 (void)
+{
+  return 0x1234cccccccc5678;
+}
+
+long f4 (void)
+{
+  return 0x7777123456787777;
+}
+
+long f5 (void)
+{
+  return 0x5555555512345678;
+}
+
+long f6 (void)
+{
+  return 0x1234bbbb5678bbbb;
+}
+
+long f7 (void)
+{
+  return 0x4444123444445678;
+}
+
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */



* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
From: Wilco Dijkstra @ 2022-10-19 15:33 UTC
  To: Richard Sandiford; +Cc: Wilco Dijkstra via Gcc-patches

ping



Hi Richard,

>>> Sounds good, but could you put it before the mode version,
>>> to avoid the forward declaration?
>>
>> I can swap them around but the forward declaration is still required as
>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>
> OK, how about moving them both above aarch64_check_bitmask?

Sure, I've moved them as well as all the related helper functions - it
makes the diff quite large, but they are all together now, which makes
sense.  I also refactored aarch64_move_imm to handle the case of a 64-bit
immediate being generated by a 32-bit MOVZ/MOVN - this simplifies
aarch64_internal_mov_immediate and the movdi patterns even further.

Cheers,
Wilco

v3: Move the immediate handling code together to avoid forward
declarations; further cleanups and simplifications.

Improve the expansion of immediates that can be created from a bitmask
immediate and 2 MOVKs.  Simplify, refactor, and improve the efficiency of
the bitmask checks and of move immediate.  Move the various immediate
handling functions together to avoid forward declarations.  Accept 32-bit
MOVZ/MOVN as valid 64-bit immediates.  Add a new constraint so the movdi
pattern only needs a single alternative for move immediate.

This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.

Passes bootstrap & regress, OK for commit?

gcc/ChangeLog:

        PR target/106583
        * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
        Add support for a bitmask immediate with 2 MOVKs.
        (aarch64_check_bitmask): New function, refactored from
        aarch64_internal_mov_immediate.
        (aarch64_replicate_bitmask_imm): Remove function, merge into...
        (aarch64_bitmask_imm): Simplify replication of small modes.
        Split function into a 64-bit only version for efficiency.
        (aarch64_zeroextended_move_imm): New function.
        (aarch64_move_imm): Refactor code.
        (aarch64_uimm12_shift): Move near other immediate functions.
        (aarch64_clamp_to_uimm12_shift): Likewise.
        (aarch64_movk_shift): Likewise.
        (aarch64_and_split_imm1): Likewise.
        (aarch64_and_split_imm2): Likewise.
        (aarch64_and_bitmask_imm): Likewise.
        (aarch64_movw_imm): Remove.
        * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
        constraints into single 'O'.
        (mov<mode>_aarch64): Likewise.
        * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
        (aarch64_bitmask_imm): Likewise.
        (aarch64_uimm12_shift): Likewise.
        (aarch64_zeroextended_move_imm): New prototype.
        * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
        limit 'N' to 64-bit only moves.

gcc/testsuite:
        PR target/106583
        * gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
@@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
                                         unsigned HOST_WIDE_INT,
                                         unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4de55beb067ea8f0be0a90060a785c94bdee708b..785ec07692981d423582051ac0897e5dbc3a001f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -305,7 +305,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                             aarch64_addr_query_type);
-static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5501,142 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
                                              factor, nelts_per_vq);
 }
 
+/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
+
+static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
+  {
+    0x0000000100000001ull,
+    0x0001000100010001ull,
+    0x0101010101010101ull,
+    0x1111111111111111ull,
+    0x5555555555555555ull,
+  };
+
+
+/* Return true if 64-bit VAL is a valid bitmask immediate.  */
+static bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
+{
+  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
+  int bits;
+
+  /* Check for a single sequence of one bits and return quickly if so.
+     The special cases of all ones and all zeroes returns false.  */
+  tmp = val + (val & -val);
+
+  if (tmp == (tmp & -tmp))
+    return (val + 1) > 1;
+
+  /* Invert if the immediate doesn't start with a zero bit - this means we
+     only need to search for sequences of one bits.  */
+  if (val & 1)
+    val = ~val;
+
+  /* Find the first set bit and set tmp to val with the first sequence of one
+     bits removed.  Return success if there is a single sequence of ones.  */
+  first_one = val & -val;
+  tmp = val & (val + first_one);
+
+  if (tmp == 0)
+    return true;
+
+  /* Find the next set bit and compute the difference in bit position.  */
+  next_one = tmp & -tmp;
+  bits = clz_hwi (first_one) - clz_hwi (next_one);
+  mask = val ^ tmp;
+
+  /* Check the bit position difference is a power of 2, and that the first
+     sequence of one bits fits within 'bits' bits.  */
+  if ((mask >> bits) != 0 || bits != (bits & -bits))
+    return false;
+
+  /* Check the sequence of one bits is repeated 64/bits times.  */
+  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
+}
+
+
+/* Return true if VAL is a valid bitmask immediate for any mode.  */
+bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
+{
+  if (mode == DImode)
+    return aarch64_bitmask_imm (val);
+
+  if (mode == SImode)
+    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
+
+  /* Replicate small immediates to fit 64 bits.  */
+  int size = GET_MODE_UNIT_PRECISION (mode);
+  val &= (HOST_WIDE_INT_1U << size) - 1;
+  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
+
+  return aarch64_bitmask_imm (val);
+}
+
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+                      unsigned HOST_WIDE_INT &val2,
+                      unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  return false;
+}
+
+/* Return true if immediate VAL can only be created by using a 32-bit
+   zero-extended move immediate, not by a 64-bit move.  */
+bool
+aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT val)
+{
+  if ((val >> 16) == 0 || (val >> 32) != 0 || (val & 0xffff) == 0)
+    return false;
+  return !aarch64_bitmask_imm (val);
+}
+
+/* Return true if VAL is an immediate that can be created by a single
+   MOV instruction.  */
+bool
+aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
+{
+  unsigned HOST_WIDE_INT val2;
+
+  if (val < 65536)
+    return true;
+
+  val2 = val ^ ((HOST_WIDE_INT) val >> 63);
+  if ((val2 >> (__builtin_ctzll (val2) & 48)) < 65536)
+    return true;
+
+  /* Special case 0xyyyyffffffffffff. */
+  if (((val2 + 1) << 16) == 0)
+    return true;
+
+  /* Special case immediates 0xffffyyyy and 0xyyyyffff.  */
+  val2 = (mode == DImode) ? val : val2;
+  if (((val2 + 1) & ~(unsigned HOST_WIDE_INT) 0xffff0000) == 0
+      || (val2 >> 16) == 0xffff)
+    return true;
+
+  if (mode == SImode || (val >> 32) == 0)
+    val = (val & 0xffffffff) | (val << 32);
+  return aarch64_bitmask_imm (val);
+}
+
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
                                 scalar_int_mode mode)
@@ -5520,31 +5655,6 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
       return 1;
     }
 
-  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
-     (with XXXX non-zero). In that case check to see if the move can be done in
-     a smaller mode.  */
-  val2 = val & 0xffffffff;
-  if (mode == DImode
-      && aarch64_move_imm (val2, SImode)
-      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
-    {
-      if (generate)
-       emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
-
-      /* Check if we have to emit a second instruction by checking to see
-         if any of the upper 32 bits of the original DI mode value is set.  */
-      if (val == val2)
-       return 1;
-
-      i = (val >> 48) ? 48 : 32;
-
-      if (generate)
-        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-                                   GEN_INT ((val >> i) & 0xffff)));
-
-      return 2;
-    }
-
   if ((val >> 32) == 0 || mode == SImode)
     {
       if (generate)
@@ -5568,26 +5678,20 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  /* Try a bitmask immediate and a movk to generate the immediate
+     in 2 instructions.  */
+  if (zero_match < 2 && one_match < 2)
     {
-      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
-        For a 64-bit bitmask try whether changing 16 bits to all ones or
-        zeroes creates a valid bitmask.  To check any repeated bitmask,
-        try using 16 bits from the other 32-bit half of val.  */
-
-      for (i = 0; i < 64; i += 16, mask <<= 16)
+      for (i = 0; i < 64; i += 16)
         {
-         val2 = val & ~mask;
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
-           break;
-         val2 = val | mask;
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
+         if (aarch64_check_bitmask (val, val2, mask << i))
             break;
-         val2 = val2 & ~mask;
-         val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
+
+         val2 = val & ~(mask << i);
+         if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
             break;
         }
+
       if (i != 64)
         {
           if (generate)
@@ -5600,6 +5704,25 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
         }
     }
 
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+    {
+      for (i = 0; i < 48; i += 16)
+       for (int j = i + 16; j < 64; j += 16)
+         if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
+           {
+             if (generate)
+               {
+                 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+                                            GEN_INT ((val >> i) & 0xffff)));
+                 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
+                                              GEN_INT ((val >> j) & 0xffff)));
+               }
+             return 3;
+           }
+    }
+
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
      otherwise skip zero bits.  */
@@ -5643,6 +5766,95 @@ aarch64_mov128_immediate (rtx imm)
          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
 }
 
+/* Return true if val can be encoded as a 12-bit unsigned immediate with
+   a left shift of 0 or 12 bits.  */
+bool
+aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
+{
+  return val < 4096 || (val & 0xfff000) == val;
+}
+
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
+{
+  /* Check to see if the value fits in 24 bits, as that is the maximum we can
+     handle correctly.  */
+  gcc_assert (val < 0x1000000);
+
+  if (val < 4096)
+    return val;
+
+  return val & 0xfff000;
+}
+
+/* Test whether:
+
+     X = (X & AND_VAL) | IOR_VAL;
+
+   can be implemented using:
+
+     MOVK X, #(IOR_VAL >> shift), LSL #shift
+
+   Return the shift if so, otherwise return -1.  */
+int
+aarch64_movk_shift (const wide_int_ref &and_val,
+                   const wide_int_ref &ior_val)
+{
+  unsigned int precision = and_val.get_precision ();
+  unsigned HOST_WIDE_INT mask = 0xffff;
+  for (unsigned int shift = 0; shift < precision; shift += 16)
+    {
+      if (and_val == ~mask && (ior_val & mask) == ior_val)
+       return shift;
+      mask <<= 16;
+    }
+  return -1;
+}
+
+/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
+   Assumed precondition: VAL_IN Is not zero.  */
+
+unsigned HOST_WIDE_INT
+aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
+{
+  int lowest_bit_set = ctz_hwi (val_in);
+  int highest_bit_set = floor_log2 (val_in);
+  gcc_assert (val_in != 0);
+
+  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
+         (HOST_WIDE_INT_1U << lowest_bit_set));
+}
+
+/* Create constant where bits outside of lowest bit set to highest bit set
+   are set to 1.  */
+
+unsigned HOST_WIDE_INT
+aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
+{
+  return val_in | ~aarch64_and_split_imm1 (val_in);
+}
+
+/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
+
+bool
+aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
+{
+  scalar_int_mode int_mode;
+  if (!is_a <scalar_int_mode> (mode, &int_mode))
+    return false;
+
+  if (aarch64_bitmask_imm (val_in, int_mode))
+    return false;
+
+  if (aarch64_move_imm (val_in, int_mode))
+    return false;
+
+  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
+
+  return aarch64_bitmask_imm (imm2, int_mode);
+}
 
 /* Return the number of temporary registers that aarch64_add_offset_1
    would need to add OFFSET to a register.  */
@@ -10098,208 +10310,6 @@ aarch64_tls_referenced_p (rtx x)
   return false;
 }
 
-
-/* Return true if val can be encoded as a 12-bit unsigned immediate with
-   a left shift of 0 or 12 bits.  */
-bool
-aarch64_uimm12_shift (HOST_WIDE_INT val)
-{
-  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
-         || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
-         );
-}
-
-/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
-   that can be created with a left shift of 0 or 12.  */
-static HOST_WIDE_INT
-aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
-{
-  /* Check to see if the value fits in 24 bits, as that is the maximum we can
-     handle correctly.  */
-  gcc_assert ((val & 0xffffff) == val);
-
-  if (((val & 0xfff) << 0) == val)
-    return val;
-
-  return val & (0xfff << 12);
-}
-
-/* Return true if val is an immediate that can be loaded into a
-   register by a MOVZ instruction.  */
-static bool
-aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
-{
-  if (GET_MODE_SIZE (mode) > 4)
-    {
-      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
-         || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
-       return 1;
-    }
-  else
-    {
-      /* Ignore sign extension.  */
-      val &= (HOST_WIDE_INT) 0xffffffff;
-    }
-  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
-         || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
-}
-
-/* Test whether:
-
-     X = (X & AND_VAL) | IOR_VAL;
-
-   can be implemented using:
-
-     MOVK X, #(IOR_VAL >> shift), LSL #shift
-
-   Return the shift if so, otherwise return -1.  */
-int
-aarch64_movk_shift (const wide_int_ref &and_val,
-                   const wide_int_ref &ior_val)
-{
-  unsigned int precision = and_val.get_precision ();
-  unsigned HOST_WIDE_INT mask = 0xffff;
-  for (unsigned int shift = 0; shift < precision; shift += 16)
-    {
-      if (and_val == ~mask && (ior_val & mask) == ior_val)
-       return shift;
-      mask <<= 16;
-    }
-  return -1;
-}
-
-/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
-   64-bit (DImode) integer.  */
-
-static unsigned HOST_WIDE_INT
-aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
-{
-  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
-  while (size < 64)
-    {
-      val &= (HOST_WIDE_INT_1U << size) - 1;
-      val |= val << size;
-      size *= 2;
-    }
-  return val;
-}
-
-/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
-
-static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
-  {
-    0x0000000100000001ull,
-    0x0001000100010001ull,
-    0x0101010101010101ull,
-    0x1111111111111111ull,
-    0x5555555555555555ull,
-  };
-
-
-/* Return true if val is a valid bitmask immediate.  */
-
-bool
-aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
-{
-  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
-  int bits;
-
-  /* Check for a single sequence of one bits and return quickly if so.
-     The special cases of all ones and all zeroes returns false.  */
-  val = aarch64_replicate_bitmask_imm (val_in, mode);
-  tmp = val + (val & -val);
-
-  if (tmp == (tmp & -tmp))
-    return (val + 1) > 1;
-
-  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
-  if (mode == SImode)
-    val = (val << 32) | (val & 0xffffffff);
-
-  /* Invert if the immediate doesn't start with a zero bit - this means we
-     only need to search for sequences of one bits.  */
-  if (val & 1)
-    val = ~val;
-
-  /* Find the first set bit and set tmp to val with the first sequence of one
-     bits removed.  Return success if there is a single sequence of ones.  */
-  first_one = val & -val;
-  tmp = val & (val + first_one);
-
-  if (tmp == 0)
-    return true;
-
-  /* Find the next set bit and compute the difference in bit position.  */
-  next_one = tmp & -tmp;
-  bits = clz_hwi (first_one) - clz_hwi (next_one);
-  mask = val ^ tmp;
-
-  /* Check the bit position difference is a power of 2, and that the first
-     sequence of one bits fits within 'bits' bits.  */
-  if ((mask >> bits) != 0 || bits != (bits & -bits))
-    return false;
-
-  /* Check the sequence of one bits is repeated 64/bits times.  */
-  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
-}
-
-/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.  
-   Assumed precondition: VAL_IN Is not zero.  */
-
-unsigned HOST_WIDE_INT
-aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
-{
-  int lowest_bit_set = ctz_hwi (val_in);
-  int highest_bit_set = floor_log2 (val_in);
-  gcc_assert (val_in != 0);
-
-  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
-         (HOST_WIDE_INT_1U << lowest_bit_set));
-}
-
-/* Create constant where bits outside of lowest bit set to highest bit set
-   are set to 1.  */
-
-unsigned HOST_WIDE_INT
-aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
-{
-  return val_in | ~aarch64_and_split_imm1 (val_in);
-}
-
-/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
-
-bool
-aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
-{
-  scalar_int_mode int_mode;
-  if (!is_a <scalar_int_mode> (mode, &int_mode))
-    return false;
-
-  if (aarch64_bitmask_imm (val_in, int_mode))
-    return false;
-
-  if (aarch64_move_imm (val_in, int_mode))
-    return false;
-
-  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
-
-  return aarch64_bitmask_imm (imm2, int_mode);
-}
-
-/* Return true if val is an immediate that can be loaded into a
-   register in a single instruction.  */
-bool
-aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
-{
-  scalar_int_mode int_mode;
-  if (!is_a <scalar_int_mode> (mode, &int_mode))
-    return false;
-
-  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
-    return 1;
-  return aarch64_bitmask_imm (val, int_mode);
-}
-
 static bool
 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
 {
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0a7633e5dd6d45282edd7a1088c14b555bc09b40..23ceca48543d23b85beea1f0bf98ef83051d80b6 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1309,16 +1309,15 @@ (define_insn_and_split "*movsi_aarch64"
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
-       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
+       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,O,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
    mov\\t%x0, %x1
    mov\\t%0, %x1
    mov\\t%x0, %1
-   mov\\t%x0, %1
-   mov\\t%w0, %1
+   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";
    #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
@@ -1340,11 +1339,11 @@ (define_insn_and_split "*movdi_aarch64"
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,
                      load_8,load_8,store_8,store_8,load_8,adr,adr,f_mcr,f_mrc,
                      fmov,neon_move")
-   (set_attr "arch"   "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
-   (set_attr "length" "4,4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
+   (set_attr "arch"   "*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
+   (set_attr "length" "4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
 )
 
 (define_insn "insv_imm<mode>"
@@ -1508,7 +1507,7 @@ (define_insn "*mov<mode>_aarch64"
 
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
-       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
+       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,O"))]
   "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
     || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
@@ -1523,7 +1522,7 @@ (define_insn "*mov<mode>_aarch64"
    ldr\\t%x0, %1
    str\\t%x1, %0
    mov\\t%x0, %x1
-   mov\\t%x0, %1"
+   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";"
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
                      f_loadd,f_stored,load_8,store_8,mov_reg,\
                      fconstd")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..e91c7eab0b3674ca34ac2f790c38fcd27986c35f 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -106,6 +106,12 @@ (define_constraint "M"
 
 (define_constraint "N"
  "A constant that can be used with a 64-bit MOV immediate operation."
+ (and (match_code "const_int")
+      (match_test "aarch64_move_imm (ival, DImode)")
+      (match_test "!aarch64_zeroextended_move_imm (ival)")))
+
+(define_constraint "O"
+ "A constant that can be used with a 32 or 64-bit MOV immediate operation."
  (and (match_code "const_int")
       (match_test "aarch64_move_imm (ival, DImode)")))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678aaaaaaaa;
+}
+
+long f3 (void)
+{
+  return 0x1234cccccccc5678;
+}
+
+long f4 (void)
+{
+  return 0x7777123456787777;
+}
+
+long f5 (void)
+{
+  return 0x5555555512345678;
+}
+
+long f6 (void)
+{
+  return 0x1234bbbb5678bbbb;
+}
+
+long f7 (void)
+{
+  return 0x4444123444445678;
+}
+
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
  2022-10-19 15:33             ` Wilco Dijkstra
@ 2022-10-20  9:31               ` Richard Sandiford
  2022-10-20 12:23                 ` Wilco Dijkstra
  0 siblings, 1 reply; 11+ messages in thread
From: Richard Sandiford @ 2022-10-20  9:31 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Wilco Dijkstra via Gcc-patches

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> ping
>
>
>
> Hi Richard,
>
>>>> Sounds good, but could you put it before the mode version,
>>>> to avoid the forward declaration?
>>>
>>> I can swap them around but the forward declaration is still required as
>>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>>
>> OK, how about moving them both above aarch64_check_bitmask?
>
> Sure, I've moved them as well as all the related helper functions - it makes the diff
> quite large, but they are all together now, which makes sense. I also refactored
> aarch64_move_imm to handle the case of a 64-bit immediate being generated
> by a 32-bit MOVZ/MOVN - this simplifies aarch64_internal_mov_immediate
> and the movdi patterns even further.

Can you do the aarch64_move_imm changes as a separate patch?  It's difficult
to review the two changes folded together like this.

Thanks,
Richard

>
> Cheers,
> Wilco
>
> v3: move immediate code together and avoid forward declarations,
> further cleanups and simplifications.
>
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  Simplify, refactor and improve the
> efficiency of the bitmask checks and the move-immediate code.  Move the
> various immediate handling functions together to avoid forward
> declarations.  Include 32-bit MOVZ/N as valid 64-bit immediates.  Add a
> new constraint so the movdi pattern only needs a single alternative for
> move immediate.
>
> This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.
>
> Passes bootstrap & regress, OK for commit?
>
> gcc/ChangeLog:
>
>         PR target/106583
>         * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
>         Add support for a bitmask immediate with 2 MOVKs.
>         (aarch64_check_bitmask): New function, refactored out of
>         aarch64_internal_mov_immediate.
>         (aarch64_replicate_bitmask_imm): Remove function, merge into...
>         (aarch64_bitmask_imm): Simplify replication of small modes.
>         Split function into 64-bit only version for efficiency.
>         (aarch64_zeroextended_move_imm): New function.
>         (aarch64_move_imm): Refactor code.
>         (aarch64_uimm12_shift): Move near other immediate functions.
>         (aarch64_clamp_to_uimm12_shift): Likewise.
>         (aarch64_movk_shift): Likewise.
>         (aarch64_and_split_imm1): Likewise.
>         (aarch64_and_split_imm2): Likewise.
>         (aarch64_and_bitmask_imm): Likewise.
>         (aarch64_movw_imm): Remove.
>         * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
>         constraints into single 'O'.
>         (mov<mode>_aarch64): Likewise.
>         * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
>         (aarch64_bitmask_imm): Likewise.
>         (aarch64_uimm12_shift): Likewise.
>         (aarch64_zeroextended_move_imm): New prototype.
>         * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
>         limit 'N' to 64-bit only moves.
>
> gcc/testsuite:
>         PR target/106583
>         * gcc.target/aarch64/pr106583.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
>  poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
>  int aarch64_get_condition_code (rtx);
>  bool aarch64_address_valid_for_prefetch_p (rtx, bool);
> -bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
> +bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
>  bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
> @@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
>                                          unsigned HOST_WIDE_INT,
>                                          unsigned HOST_WIDE_INT);
>  bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
> -bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
> +bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
>  machine_mode aarch64_sve_int_mode (machine_mode);
>  opt_machine_mode aarch64_sve_pred_mode (unsigned int);
>  machine_mode aarch64_sve_pred_mode (machine_mode);
> @@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
>  bool aarch64_sve_float_mul_immediate_p (rtx);
>  bool aarch64_split_dimode_const_store (rtx, rtx);
>  bool aarch64_symbolic_address_p (rtx);
> -bool aarch64_uimm12_shift (HOST_WIDE_INT);
> +bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
>  int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
> +bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
>  bool aarch64_use_return_insn_p (void);
>  const char *aarch64_output_casesi (rtx *);
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 4de55beb067ea8f0be0a90060a785c94bdee708b..785ec07692981d423582051ac0897e5dbc3a001f 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -305,7 +305,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>                                              aarch64_addr_query_type);
> -static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
>
>  /* The processor for which instructions should be scheduled.  */
>  enum aarch64_processor aarch64_tune = cortexa53;
> @@ -5502,6 +5501,142 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
>                                               factor, nelts_per_vq);
>  }
>
> +/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
> +
> +static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
> +  {
> +    0x0000000100000001ull,
> +    0x0001000100010001ull,
> +    0x0101010101010101ull,
> +    0x1111111111111111ull,
> +    0x5555555555555555ull,
> +  };
> +
> +
> +/* Return true if 64-bit VAL is a valid bitmask immediate.  */
> +static bool
> +aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
> +{
> +  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
> +  int bits;
> +
> +  /* Check for a single sequence of one bits and return quickly if so.
> +     The special cases of all ones and all zeroes returns false.  */
> +  tmp = val + (val & -val);
> +
> +  if (tmp == (tmp & -tmp))
> +    return (val + 1) > 1;
> +
> +  /* Invert if the immediate doesn't start with a zero bit - this means we
> +     only need to search for sequences of one bits.  */
> +  if (val & 1)
> +    val = ~val;
> +
> +  /* Find the first set bit and set tmp to val with the first sequence of one
> +     bits removed.  Return success if there is a single sequence of ones.  */
> +  first_one = val & -val;
> +  tmp = val & (val + first_one);
> +
> +  if (tmp == 0)
> +    return true;
> +
> +  /* Find the next set bit and compute the difference in bit position.  */
> +  next_one = tmp & -tmp;
> +  bits = clz_hwi (first_one) - clz_hwi (next_one);
> +  mask = val ^ tmp;
> +
> +  /* Check the bit position difference is a power of 2, and that the first
> +     sequence of one bits fits within 'bits' bits.  */
> +  if ((mask >> bits) != 0 || bits != (bits & -bits))
> +    return false;
> +
> +  /* Check the sequence of one bits is repeated 64/bits times.  */
> +  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
> +}
> +
> +
> +/* Return true if VAL is a valid bitmask immediate for any mode.  */
> +bool
> +aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
> +{
> +  if (mode == DImode)
> +    return aarch64_bitmask_imm (val);
> +
> +  if (mode == SImode)
> +    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
> +
> +  /* Replicate small immediates to fit 64 bits.  */
> +  int size = GET_MODE_UNIT_PRECISION (mode);
> +  val &= (HOST_WIDE_INT_1U << size) - 1;
> +  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
> +
> +  return aarch64_bitmask_imm (val);
> +}
> +
> +/* Return true if the immediate VAL can be a bitfield immediate
> +   by changing the given MASK bits in VAL to zeroes, ones or bits
> +   from the other half of VAL.  Return the new immediate in VAL2.  */
> +static inline bool
> +aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
> +                      unsigned HOST_WIDE_INT &val2,
> +                      unsigned HOST_WIDE_INT mask)
> +{
> +  val2 = val & ~mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val2 = val | mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val = val & ~mask;
> +  val2 = val | (((val >> 32) | (val << 32)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  val2 = val | (((val >> 16) | (val << 48)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +    return true;
> +  return false;
> +}
> +
> +/* Return true if immediate VAL can only be created by using a 32-bit
> +   zero-extended move immediate, not by a 64-bit move.  */
> +bool
> +aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT val)
> +{
> +  if ((val >> 16) == 0 || (val >> 32) != 0 || (val & 0xffff) == 0)
> +    return false;
> +  return !aarch64_bitmask_imm (val);
> +}
> +
> +/* Return true if VAL is an immediate that can be created by a single
> +   MOV instruction.  */
> +bool
> +aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
> +{
> +  unsigned HOST_WIDE_INT val2;
> +
> +  if (val < 65536)
> +    return true;
> +
> +  val2 = val ^ ((HOST_WIDE_INT) val >> 63);
> +  if ((val2 >> (__builtin_ctzll (val2) & 48)) < 65536)
> +    return true;
> +
> +  /* Special case 0xyyyyffffffffffff. */
> +  if (((val2 + 1) << 16) == 0)
> +    return true;
> +
> +  /* Special case immediates 0xffffyyyy and 0xyyyyffff.  */
> +  val2 = (mode == DImode) ? val : val2;
> +  if (((val2 + 1) & ~(unsigned HOST_WIDE_INT) 0xffff0000) == 0
> +      || (val2 >> 16) == 0xffff)
> +    return true;
> +
> +  if (mode == SImode || (val >> 32) == 0)
> +    val = (val & 0xffffffff) | (val << 32);
> +  return aarch64_bitmask_imm (val);
> +}
> +
> +
>  static int
>  aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>                                  scalar_int_mode mode)
> @@ -5520,31 +5655,6 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>        return 1;
>      }
>
> -  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
> -     (with XXXX non-zero). In that case check to see if the move can be done in
> -     a smaller mode.  */
> -  val2 = val & 0xffffffff;
> -  if (mode == DImode
> -      && aarch64_move_imm (val2, SImode)
> -      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
> -    {
> -      if (generate)
> -       emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> -
> -      /* Check if we have to emit a second instruction by checking to see
> -         if any of the upper 32 bits of the original DI mode value is set.  */
> -      if (val == val2)
> -       return 1;
> -
> -      i = (val >> 48) ? 48 : 32;
> -
> -      if (generate)
> -        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> -                                   GEN_INT ((val >> i) & 0xffff)));
> -
> -      return 2;
> -    }
> -
>    if ((val >> 32) == 0 || mode == SImode)
>      {
>        if (generate)
> @@ -5568,26 +5678,20 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>    one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>      ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>
> -  if (zero_match != 2 && one_match != 2)
> +  /* Try a bitmask immediate and a movk to generate the immediate
> +     in 2 instructions.  */
> +  if (zero_match < 2 && one_match < 2)
>      {
> -      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
> -        For a 64-bit bitmask try whether changing 16 bits to all ones or
> -        zeroes creates a valid bitmask.  To check any repeated bitmask,
> -        try using 16 bits from the other 32-bit half of val.  */
> -
> -      for (i = 0; i < 64; i += 16, mask <<= 16)
> +      for (i = 0; i < 64; i += 16)
>          {
> -         val2 = val & ~mask;
> -         if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -           break;
> -         val2 = val | mask;
> -         if (val2 != val && aarch64_bitmask_imm (val2, mode))
> +         if (aarch64_check_bitmask (val, val2, mask << i))
>              break;
> -         val2 = val2 & ~mask;
> -         val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> -         if (val2 != val && aarch64_bitmask_imm (val2, mode))
> +
> +         val2 = val & ~(mask << i);
> +         if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
>              break;
>          }
> +
>        if (i != 64)
>          {
>            if (generate)
> @@ -5600,6 +5704,25 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>          }
>      }
>
> +  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
> +  if (zero_match + one_match == 0)
> +    {
> +      for (i = 0; i < 48; i += 16)
> +       for (int j = i + 16; j < 64; j += 16)
> +         if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
> +           {
> +             if (generate)
> +               {
> +                 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +                                            GEN_INT ((val >> i) & 0xffff)));
> +                 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
> +                                              GEN_INT ((val >> j) & 0xffff)));
> +               }
> +             return 3;
> +           }
> +    }
> +
>    /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
>       are emitted by the initial mov.  If one_match > zero_match, skip set bits,
>       otherwise skip zero bits.  */
> @@ -5643,6 +5766,95 @@ aarch64_mov128_immediate (rtx imm)
>           + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
>  }
>
> +/* Return true if val can be encoded as a 12-bit unsigned immediate with
> +   a left shift of 0 or 12 bits.  */
> +bool
> +aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
> +{
> +  return val < 4096 || (val & 0xfff000) == val;
> +}
> +
> +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> +   that can be created with a left shift of 0 or 12.  */
> +static HOST_WIDE_INT
> +aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
> +{
> +  /* Check to see if the value fits in 24 bits, as that is the maximum we can
> +     handle correctly.  */
> +  gcc_assert (val < 0x1000000);
> +
> +  if (val < 4096)
> +    return val;
> +
> +  return val & 0xfff000;
> +}
> +
> +/* Test whether:
> +
> +     X = (X & AND_VAL) | IOR_VAL;
> +
> +   can be implemented using:
> +
> +     MOVK X, #(IOR_VAL >> shift), LSL #shift
> +
> +   Return the shift if so, otherwise return -1.  */
> +int
> +aarch64_movk_shift (const wide_int_ref &and_val,
> +                   const wide_int_ref &ior_val)
> +{
> +  unsigned int precision = and_val.get_precision ();
> +  unsigned HOST_WIDE_INT mask = 0xffff;
> +  for (unsigned int shift = 0; shift < precision; shift += 16)
> +    {
> +      if (and_val == ~mask && (ior_val & mask) == ior_val)
> +       return shift;
> +      mask <<= 16;
> +    }
> +  return -1;
> +}
> +
> +/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
> +   Assumed precondition: VAL_IN Is not zero.  */
> +
> +unsigned HOST_WIDE_INT
> +aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
> +{
> +  int lowest_bit_set = ctz_hwi (val_in);
> +  int highest_bit_set = floor_log2 (val_in);
> +  gcc_assert (val_in != 0);
> +
> +  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
> +         (HOST_WIDE_INT_1U << lowest_bit_set));
> +}
> +
> +/* Create constant where bits outside of lowest bit set to highest bit set
> +   are set to 1.  */
> +
> +unsigned HOST_WIDE_INT
> +aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
> +{
> +  return val_in | ~aarch64_and_split_imm1 (val_in);
> +}
> +
> +/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
> +
> +bool
> +aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
> +{
> +  scalar_int_mode int_mode;
> +  if (!is_a <scalar_int_mode> (mode, &int_mode))
> +    return false;
> +
> +  if (aarch64_bitmask_imm (val_in, int_mode))
> +    return false;
> +
> +  if (aarch64_move_imm (val_in, int_mode))
> +    return false;
> +
> +  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
> +
> +  return aarch64_bitmask_imm (imm2, int_mode);
> +}
>
>  /* Return the number of temporary registers that aarch64_add_offset_1
>     would need to add OFFSET to a register.  */
> @@ -10098,208 +10310,6 @@ aarch64_tls_referenced_p (rtx x)
>    return false;
>  }
>
> -
> -/* Return true if val can be encoded as a 12-bit unsigned immediate with
> -   a left shift of 0 or 12 bits.  */
> -bool
> -aarch64_uimm12_shift (HOST_WIDE_INT val)
> -{
> -  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
> -         || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
> -         );
> -}
> -
> -/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> -   that can be created with a left shift of 0 or 12.  */
> -static HOST_WIDE_INT
> -aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
> -{
> -  /* Check to see if the value fits in 24 bits, as that is the maximum we can
> -     handle correctly.  */
> -  gcc_assert ((val & 0xffffff) == val);
> -
> -  if (((val & 0xfff) << 0) == val)
> -    return val;
> -
> -  return val & (0xfff << 12);
> -}
> -
> -/* Return true if val is an immediate that can be loaded into a
> -   register by a MOVZ instruction.  */
> -static bool
> -aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
> -{
> -  if (GET_MODE_SIZE (mode) > 4)
> -    {
> -      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
> -         || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
> -       return 1;
> -    }
> -  else
> -    {
> -      /* Ignore sign extension.  */
> -      val &= (HOST_WIDE_INT) 0xffffffff;
> -    }
> -  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
> -         || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
> -}
> -
> -/* Test whether:
> -
> -     X = (X & AND_VAL) | IOR_VAL;
> -
> -   can be implemented using:
> -
> -     MOVK X, #(IOR_VAL >> shift), LSL #shift
> -
> -   Return the shift if so, otherwise return -1.  */
> -int
> -aarch64_movk_shift (const wide_int_ref &and_val,
> -                   const wide_int_ref &ior_val)
> -{
> -  unsigned int precision = and_val.get_precision ();
> -  unsigned HOST_WIDE_INT mask = 0xffff;
> -  for (unsigned int shift = 0; shift < precision; shift += 16)
> -    {
> -      if (and_val == ~mask && (ior_val & mask) == ior_val)
> -       return shift;
> -      mask <<= 16;
> -    }
> -  return -1;
> -}
> -
> -/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
> -   64-bit (DImode) integer.  */
> -
> -static unsigned HOST_WIDE_INT
> -aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
> -{
> -  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
> -  while (size < 64)
> -    {
> -      val &= (HOST_WIDE_INT_1U << size) - 1;
> -      val |= val << size;
> -      size *= 2;
> -    }
> -  return val;
> -}
> -
> -/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
> -
> -static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
> -  {
> -    0x0000000100000001ull,
> -    0x0001000100010001ull,
> -    0x0101010101010101ull,
> -    0x1111111111111111ull,
> -    0x5555555555555555ull,
> -  };
> -
> -
> -/* Return true if val is a valid bitmask immediate.  */
> -
> -bool
> -aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
> -{
> -  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
> -  int bits;
> -
> -  /* Check for a single sequence of one bits and return quickly if so.
> -     The special cases of all ones and all zeroes returns false.  */
> -  val = aarch64_replicate_bitmask_imm (val_in, mode);
> -  tmp = val + (val & -val);
> -
> -  if (tmp == (tmp & -tmp))
> -    return (val + 1) > 1;
> -
> -  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
> -  if (mode == SImode)
> -    val = (val << 32) | (val & 0xffffffff);
> -
> -  /* Invert if the immediate doesn't start with a zero bit - this means we
> -     only need to search for sequences of one bits.  */
> -  if (val & 1)
> -    val = ~val;
> -
> -  /* Find the first set bit and set tmp to val with the first sequence of one
> -     bits removed.  Return success if there is a single sequence of ones.  */
> -  first_one = val & -val;
> -  tmp = val & (val + first_one);
> -
> -  if (tmp == 0)
> -    return true;
> -
> -  /* Find the next set bit and compute the difference in bit position.  */
> -  next_one = tmp & -tmp;
> -  bits = clz_hwi (first_one) - clz_hwi (next_one);
> -  mask = val ^ tmp;
> -
> -  /* Check the bit position difference is a power of 2, and that the first
> -     sequence of one bits fits within 'bits' bits.  */
> -  if ((mask >> bits) != 0 || bits != (bits & -bits))
> -    return false;
> -
> -  /* Check the sequence of one bits is repeated 64/bits times.  */
> -  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
> -}
> -
> -/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
> -   Assumed precondition: VAL_IN Is not zero.  */
> -
> -unsigned HOST_WIDE_INT
> -aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
> -{
> -  int lowest_bit_set = ctz_hwi (val_in);
> -  int highest_bit_set = floor_log2 (val_in);
> -  gcc_assert (val_in != 0);
> -
> -  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
> -         (HOST_WIDE_INT_1U << lowest_bit_set));
> -}
> -
> -/* Create constant where bits outside of lowest bit set to highest bit set
> -   are set to 1.  */
> -
> -unsigned HOST_WIDE_INT
> -aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
> -{
> -  return val_in | ~aarch64_and_split_imm1 (val_in);
> -}
> -
> -/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
> -
> -bool
> -aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
> -{
> -  scalar_int_mode int_mode;
> -  if (!is_a <scalar_int_mode> (mode, &int_mode))
> -    return false;
> -
> -  if (aarch64_bitmask_imm (val_in, int_mode))
> -    return false;
> -
> -  if (aarch64_move_imm (val_in, int_mode))
> -    return false;
> -
> -  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
> -
> -  return aarch64_bitmask_imm (imm2, int_mode);
> -}
> -
> -/* Return true if val is an immediate that can be loaded into a
> -   register in a single instruction.  */
> -bool
> -aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
> -{
> -  scalar_int_mode int_mode;
> -  if (!is_a <scalar_int_mode> (mode, &int_mode))
> -    return false;
> -
> -  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
> -    return 1;
> -  return aarch64_bitmask_imm (val, int_mode);
> -}
> -
>  static bool
>  aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>  {
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 0a7633e5dd6d45282edd7a1088c14b555bc09b40..23ceca48543d23b85beea1f0bf98ef83051d80b6 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1309,16 +1309,15 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
> -       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
> +       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,O,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>      || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
>     mov\\t%x0, %x1
>     mov\\t%0, %x1
>     mov\\t%x0, %1
> -   mov\\t%x0, %1
> -   mov\\t%w0, %1
> +   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";
>     #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
> @@ -1340,11 +1339,11 @@ (define_insn_and_split "*movdi_aarch64"
>         DONE;
>      }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,
>                       load_8,load_8,store_8,store_8,load_8,adr,adr,f_mcr,f_mrc,
>                       fmov,neon_move")
> -   (set_attr "arch"   "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
> -   (set_attr "length" "4,4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
> +   (set_attr "arch"   "*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
> +   (set_attr "length" "4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
>  )
>
>  (define_insn "insv_imm<mode>"
> @@ -1508,7 +1507,7 @@ (define_insn "*mov<mode>_aarch64"
>
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
> -       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
> +       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,O"))]
>    "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
>      || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
>    "@
> @@ -1523,7 +1522,7 @@ (define_insn "*mov<mode>_aarch64"
>     ldr\\t%x0, %1
>     str\\t%x1, %0
>     mov\\t%x0, %x1
> -   mov\\t%x0, %1"
> +   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";"
>    [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
>                       f_loadd,f_stored,load_8,store_8,mov_reg,\
>                       fconstd")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..e91c7eab0b3674ca34ac2f790c38fcd27986c35f 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -106,6 +106,12 @@ (define_constraint "M"
>
>  (define_constraint "N"
>   "A constant that can be used with a 64-bit MOV immediate operation."
> + (and (match_code "const_int")
> +      (match_test "aarch64_move_imm (ival, DImode)")
> +      (match_test "!aarch64_zeroextended_move_imm (ival)")))
> +
> +(define_constraint "O"
> + "A constant that can be used with a 32 or 64-bit MOV immediate operation."
>   (and (match_code "const_int")
>        (match_test "aarch64_move_imm (ival, DImode)")))
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> @@ -0,0 +1,41 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> +  return 0x7efefefefefefeff;
> +}
> +
> +long f2 (void)
> +{
> +  return 0x12345678aaaaaaaa;
> +}
> +
> +long f3 (void)
> +{
> +  return 0x1234cccccccc5678;
> +}
> +
> +long f4 (void)
> +{
> +  return 0x7777123456787777;
> +}
> +
> +long f5 (void)
> +{
> +  return 0x5555555512345678;
> +}
> +
> +long f6 (void)
> +{
> +  return 0x1234bbbb5678bbbb;
> +}
> +
> +long f7 (void)
> +{
> +  return 0x4444123444445678;
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
  2022-10-20  9:31               ` Richard Sandiford
@ 2022-10-20 12:23                 ` Wilco Dijkstra
  2022-10-21  9:20                   ` Richard Sandiford
  0 siblings, 1 reply; 11+ messages in thread
From: Wilco Dijkstra @ 2022-10-20 12:23 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Wilco Dijkstra via Gcc-patches

Hi Richard,

> Can you do the aarch64_move_imm changes as a separate patch?  It's difficult
> to review the two changes folded together like this.

Sure, I'll send a separate patch. So here is version 2 again:

[PATCH v2][AArch64] Improve immediate expansion [PR106583]

Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  Simplify, refactor and improve the
efficiency of the bitmask checks.  This reduces the number of
4-instruction immediates in SPECINT/FP by 10-15%.
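
As an illustration of the new search (hand-traced, so treat it as a
sketch): for f1's constant 0x7efefefefefefeff, clearing the bottom and top
16-bit chunks and filling them with bits rotated in from the other 32-bit
half gives 0xfefefefefefefefe, a valid bitmask immediate, so the expansion
becomes a MOV plus MOVKs at LSL #0 and LSL #48 - three instructions
instead of four.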

Passes regress, OK for commit?

gcc/ChangeLog:

        PR target/106583
        * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
        Add support for a bitmask immediate with 2 MOVKs.
        (aarch64_check_bitmask): New function, refactored out of
        aarch64_internal_mov_immediate.
        (aarch64_replicate_bitmask_imm): Remove function, merge into...
        (aarch64_bitmask_imm): Simplify replication of small modes.
        Split function into 64-bit only version for efficiency.

gcc/testsuite:
        PR target/106583
        * gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                             aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
                                              factor, nelts_per_vq);
 }
 
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+                      unsigned HOST_WIDE_INT &val2,
+                      unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+    return true;
+  return false;
+}
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
                                 scalar_int_mode mode)
@@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
     {
       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
          For a 64-bit bitmask try whether changing 16 bits to all ones or
          zeroes creates a valid bitmask.  To check any repeated bitmask,
          try using 16 bits from the other 32-bit half of val.  */
 
-      for (i = 0; i < 64; i += 16, mask <<= 16)
-       {
-         val2 = val & ~mask;
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
-           break;
-         val2 = val | mask;
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
-           break;
-         val2 = val2 & ~mask;
-         val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
-         if (val2 != val && aarch64_bitmask_imm (val2, mode))
-           break;
-       }
-      if (i != 64)
-       {
-         if (generate)
+      for (i = 0; i < 64; i += 16)
+       if (aarch64_check_bitmask (val, val2, mask << i))
+         {
+           if (generate)
+             {
+               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+                                          GEN_INT ((val >> i) & 0xffff)));
+             }
+           return 2;
+         }
+    }
+
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+    {
+      for (i = 0; i < 48; i += 16)
+       for (int j = i + 16; j < 64; j += 16)
+         if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
             {
-             emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
-             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-                                        GEN_INT ((val >> i) & 0xffff)));
+             if (generate)
+               {
+                 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+                                            GEN_INT ((val >> i) & 0xffff)));
+                 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
+                                              GEN_INT ((val >> j) & 0xffff)));
+               }
+             return 3;
             }
-         return 2;
-       }
     }
 
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
@@ -10168,22 +10200,6 @@ aarch64_movk_shift (const wide_int_ref &and_val,
   return -1;
 }
 
-/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
-   64-bit (DImode) integer.  */
-
-static unsigned HOST_WIDE_INT
-aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
-{
-  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
-  while (size < 64)
-    {
-      val &= (HOST_WIDE_INT_1U << size) - 1;
-      val |= val << size;
-      size *= 2;
-    }
-  return val;
-}
-
 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
 
 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
@@ -10196,26 +10212,42 @@ static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
   };
 
 
-/* Return true if val is a valid bitmask immediate.  */
-
+/* Return true if VAL is a valid bitmask immediate for any mode.  */
 bool
 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
 {
-  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
+  if (mode == DImode)
+    return aarch64_bitmask_imm (val_in);
+
+  unsigned HOST_WIDE_INT val = val_in;
+
+  if (mode == SImode)
+    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
+
+  /* Replicate small immediates to fit 64 bits.  */
+  int size = GET_MODE_UNIT_PRECISION (mode);
+  val &= (HOST_WIDE_INT_1U << size) - 1;
+  val *= bitmask_imm_mul[__builtin_clz (size) - 26];
+
+  return aarch64_bitmask_imm (val);
+}
+
+
+/* Return true if 64-bit VAL is a valid bitmask immediate.  */
+
+static bool
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
+{
+  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
   int bits;
 
   /* Check for a single sequence of one bits and return quickly if so.
      The special cases of all ones and all zeroes returns false.  */
-  val = aarch64_replicate_bitmask_imm (val_in, mode);
   tmp = val + (val & -val);
 
   if (tmp == (tmp & -tmp))
     return (val + 1) > 1;
 
-  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
-  if (mode == SImode)
-    val = (val << 32) | (val & 0xffffffff);
-
   /* Invert if the immediate doesn't start with a zero bit - this means we
      only need to search for sequences of one bits.  */
   if (val & 1)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678aaaaaaaa;
+}
+
+long f3 (void)
+{
+  return 0x1234cccccccc5678;
+}
+
+long f4 (void)
+{
+  return 0x7777123456787777;
+}
+
+long f5 (void)
+{
+  return 0x5555555512345678;
+}
+
+long f6 (void)
+{
+  return 0x1234bbbb5678bbbb;
+}
+
+long f7 (void)
+{
+  return 0x4444123444445678;
+}
+
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][AArch64] Improve immediate expansion [PR106583]
  2022-10-20 12:23                 ` Wilco Dijkstra
@ 2022-10-21  9:20                   ` Richard Sandiford
  0 siblings, 0 replies; 11+ messages in thread
From: Richard Sandiford @ 2022-10-21  9:20 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Wilco Dijkstra via Gcc-patches

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> Can you do the aarch64_mov_imm changes as a separate patch?  It's difficult
>> to review the two changes folded together like this.
>
> Sure, I'll send a separate patch. So here is version 2 again:

I still think we should move the functions to avoid the forward
declarations.  That part was fine (and OK to review).  It was folding
in the extra changes to the way that we generate move immediates that
made it difficult.

Could you send a patch that makes only the changes in v2, but moves
the functions around?  In fact, the positioning of the functions
in the v3 patch looked good, so the patch is OK with the contents
of v2 but the positioning of v3.
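
A toy sketch of the ordering point (hypothetical names, not GCC code):
defining a static helper ahead of its first caller makes the extra
file-scope forward declaration unnecessary.

#include <stdio.h>

/* Defined before its caller, so no "static int imm_ok (unsigned);"
   declaration is needed near the top of the file.  */
static int
imm_ok (unsigned val)
{
  return val != 0 && (val & (val + 1)) == 0;  /* Values of form 2^n-1.  */
}

static int
check (unsigned val)
{
  return imm_ok (val);  /* The callee is already visible here.  */
}

int
main (void)
{
  printf ("%d %d\n", check (0x3f), check (0x40));  /* Prints "1 0".  */
  return 0;
}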

Thanks,
Richard

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-10-21  9:20 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-04 17:22 [PATCH][AArch64] Improve immediate expansion [PR106583] Wilco Dijkstra
2022-10-05  8:42 ` Richard Sandiford
2022-10-06 17:29   ` Wilco Dijkstra
2022-10-07 13:17     ` Richard Sandiford
2022-10-07 13:48       ` Wilco Dijkstra
2022-10-07 14:49         ` Richard Sandiford
2022-10-12 14:57           ` Wilco Dijkstra
2022-10-19 15:33             ` Wilco Dijkstra
2022-10-20  9:31               ` Richard Sandiford
2022-10-20 12:23                 ` Wilco Dijkstra
2022-10-21  9:20                   ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).