public inbox for gcc-patches@gcc.gnu.org
* [PATCH] AArch64: Improve immediate expansion [PR105928]
@ 2023-09-14 15:24 Wilco Dijkstra
  2023-09-17 11:04 ` Richard Sandiford
  0 siblings, 1 reply; 5+ messages in thread
From: Wilco Dijkstra @ 2023-09-14 15:24 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Sandiford, Kyrylo Tkachov


Add support for expanding immediates which can be created from two MOVKs
and a shifted ORR or BIC instruction.  Change aarch64_split_dimode_const_store
to apply whenever it saves at least one instruction.

This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.
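
As an illustration (my sketch of the expected codegen, matching the f2 case
in the new test), a repeating immediate such as 0x1234567812345678 previously
needed a MOV plus three MOVKs:

	mov	x0, 22136
	movk	x0, 0x1234, lsl 16
	movk	x0, 0x5678, lsl 32
	movk	x0, 0x1234, lsl 48

and can now be built in three instructions by ORRing the low 32 bits into
the high half:

	mov	x0, 22136
	movk	x0, 0x1234, lsl 16
	orr	x0, x0, x0, lsl 32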

Passes regress, OK for commit?

gcc/ChangeLog:
	PR target/105928
	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
	Add support for immediates using shifted ORR/BIC.
	(aarch64_split_dimode_const_store): Apply if we save one instruction.
	* config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3):
	Make pattern global.

gcc/testsuite/ChangeLog:
	PR target/105928
	* gcc.target/aarch64/pr105928.c: Add new test.
	* gcc.target/aarch64/vect-cse-codegen.c: Fix test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c44c0b979d0cc3755c61dcf566cfddedccebf1ea..832f8197ac8d1a04986791e6f3e51861e41944b2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5639,7 +5639,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 				machine_mode mode)
 {
   int i;
-  unsigned HOST_WIDE_INT val, val2, mask;
+  unsigned HOST_WIDE_INT val, val2, val3, mask;
   int one_match, zero_match;
   int num_insns;
 
@@ -5721,6 +5721,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 		}
 	      return 3;
 	    }
+
+      /* Try shifting and inserting the bottom 32 bits into the top bits.  */
+      val2 = val & 0xffffffff;
+      val3 = 0xffffffff;
+      val3 = val2 | (val3 << 32);
+      for (i = 17; i < 48; i++)
+	if ((val2 | (val2 << i)) == val)
+	  {
+	    if (generate)
+	      {
+		emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
+		emit_insn (gen_insv_immdi (dest, GEN_INT (16),
+					   GEN_INT (val2 >> 16)));
+		emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
+	      }
+	    return 3;
+	  }
+	else if ((val3 & ~(val3 << i)) == val)
+	  {
+	    if (generate)
+	      {
+		emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
+		emit_insn (gen_insv_immdi (dest, GEN_INT (16),
+					   GEN_INT (val2 >> 16)));
+		emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
+						      dest));
+	      }
+	    return 3;
+	  }
     }
 
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
@@ -25506,8 +25535,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
   rtx lo = gen_lowpart (SImode, src);
   rtx hi = gen_highpart_mode (SImode, DImode, src);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   if (!rtx_equal_p (lo, hi))
     return false;
 
@@ -25526,14 +25553,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
      MOV	w1, 49370
      MOVK	w1, 0x140, lsl 16
      STP	w1, w1, [x0]
-   So we want to perform this only when we save two instructions
-   or more.  When optimizing for size, however, accept any code size
-   savings we can.  */
-  if (size_p && orig_cost <= lo_cost)
-    return false;
-
-  if (!size_p
-      && (orig_cost <= lo_cost + 1))
+   So we want to perform this when we save at least one instruction.  */
+  if (orig_cost <= lo_cost)
     return false;
 
   rtx mem_lo = adjust_address (dst, SImode, 0);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 97f70d39cc0ddeb330e044bae0544d85a695567d..932d4d47a5db1a74e0d0565b565afbd769090853 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4618,7 +4618,7 @@ (define_insn "*and_<SHIFT:optab>si3_compare0_uxtw"
   [(set_attr "type" "logics_shift_imm")]
 )
 
-(define_insn "*<LOGICAL:optab>_<SHIFT:optab><mode>3"
+(define_insn "<LOGICAL:optab>_<SHIFT:optab><mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	(LOGICAL:GPI (SHIFT:GPI
 		      (match_operand:GPI 1 "register_operand" "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr105928.c b/gcc/testsuite/gcc.target/aarch64/pr105928.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab52247df66020d0b8fe70bc81f572e8b64c2bb5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr105928.c
@@ -0,0 +1,43 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x80402010080400;
+}
+
+long f2 (void)
+{
+  return 0x1234567812345678;
+}
+
+long f3 (void)
+{
+  return 0x4567800012345678;
+}
+
+long f4 (void)
+{
+  return 0x3ecccccd3ecccccd;
+}
+
+long f5 (void)
+{
+  return 0x38e38e38e38e38e;
+}
+
+long f6 (void)
+{
+  return 0x1745d1745d1745d;
+}
+
+void f7 (long *p)
+{
+  *p = 0x1234567812345678;
+}
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */
+/* { dg-final { scan-assembler-times {\tbic\t} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tstp\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
index d025e989a1e67f00f4f4ce94897a961d38abfab7..2b8e64313bb47f995f071c728b1d84473807bc64 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -72,8 +72,7 @@ test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
 **	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
 **	mov	x[0-9]+, 16512
 **	movk	x[0-9]+, 0x1020, lsl 16
-**	movk	x[0-9]+, 0x408, lsl 32
-**	movk	x[0-9]+, 0x102, lsl 48
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+, lsl 28
 **	fmov	d[0-9]+, x[0-9]+
 **	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
 **	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
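
As a standalone restatement of the two new loop conditions (a hypothetical
helper for checking candidate values, not part of the patch):

#include <stdbool.h>
#include <stdint.h>

/* Return true if VAL can be materialized as MOV+MOVK of its low 32 bits
   followed by a shifted ORR or BIC, per the new loop in
   aarch64_internal_mov_immediate.  */
static bool
orr_bic_candidate (uint64_t val)
{
  uint64_t val2 = val & 0xffffffff;
  uint64_t val3 = val2 | 0xffffffff00000000ull;

  for (int i = 17; i < 48; i++)
    {
      if ((val2 | (val2 << i)) == val)	/* ORR with LSL #i.  */
	return true;
      if ((val3 & ~(val3 << i)) == val)	/* BIC with LSL #i.  */
	return true;
    }
  return false;
}

/* e.g. orr_bic_candidate (0x1234567812345678ull) holds with i == 32.  */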



* Re: [PATCH] AArch64: Improve immediate expansion [PR105928]
  2023-09-14 15:24 [PATCH] AArch64: Improve immediate expansion [PR105928] Wilco Dijkstra
@ 2023-09-17 11:04 ` Richard Sandiford
  2023-09-18 17:41   ` Wilco Dijkstra
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Sandiford @ 2023-09-17 11:04 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GCC Patches, Kyrylo Tkachov

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Add support for expanding immediates which can be created from two MOVKs
> and a shifted ORR or BIC instruction.  Change aarch64_split_dimode_const_store
> to apply whenever it saves at least one instruction.
>
> This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>         PR target/105928
>         * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
>         Add support for immediates using shifted ORR/BIC.
>         (aarch64_split_dimode_const_store): Apply if we save one instruction.
>         * config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3):
>         Make pattern global.
>
> gcc/testsuite/ChangeLog:
>         PR target/105928
>         * gcc.target/aarch64/pr105928.c: Add new test.
>         * gcc.target/aarch64/vect-cse-codegen.c: Fix test.

Looks good apart from a comment below about the test.

I was worried that reusing "dest" for intermediate results would
prevent CSE for cases like:

void g (long long, long long);
void
f (long long *ptr)
{
  g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
}

where the same 32-bit lowpart pattern is used for two immediates.
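
The ideal shared sequence would be something like (my sketch, assuming the
two arguments end up in x0 and x1):

	mov	w2, 0xee22
	movk	w2, 0xee11, lsl 16	// shared lowpart 0xee11ee22
	orr	x0, x2, x2, lsl 32	// 0xee11ee22ee11ee22
	mov	x1, x2
	movk	x1, 0xdc44, lsl 32
	movk	x1, 0xdc23, lsl 48	// 0xdc23dc44ee11ee22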
In principle, that could be avoided using:

	    if (generate)
	      {
		rtx tmp = aarch64_target_reg (dest, DImode);
		emit_insn (gen_rtx_SET (tmp, GEN_INT (val2 & 0xffff)));
		emit_insn (gen_insv_immdi (tmp, GEN_INT (16),
					   GEN_INT (val2 >> 16)));
		set_unique_reg_note (get_last_insn (), REG_EQUAL,
				     GEN_INT (val2));
		emit_insn (gen_ior_ashldi3 (dest, tmp, GEN_INT (i), tmp));
	      }
	    return 3;

But it doesn't work, since we only expose the individual immediates
during split1, and nothing between split1 and ira is able to remove
redundancies.  There's no point complicating the code for a theoretical
future optimisation.

> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c44c0b979d0cc3755c61dcf566cfddedccebf1ea..832f8197ac8d1a04986791e6f3e51861e41944b2 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -5639,7 +5639,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>                                 machine_mode mode)
>  {
>    int i;
> -  unsigned HOST_WIDE_INT val, val2, mask;
> +  unsigned HOST_WIDE_INT val, val2, val3, mask;
>    int one_match, zero_match;
>    int num_insns;
>
> @@ -5721,6 +5721,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>                 }
>               return 3;
>             }
> +
> +      /* Try shifting and inserting the bottom 32 bits into the top bits.  */
> +      val2 = val & 0xffffffff;
> +      val3 = 0xffffffff;
> +      val3 = val2 | (val3 << 32);
> +      for (i = 17; i < 48; i++)
> +       if ((val2 | (val2 << i)) == val)
> +         {
> +           if (generate)
> +             {
> +               emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
> +               emit_insn (gen_insv_immdi (dest, GEN_INT (16),
> +                                          GEN_INT (val2 >> 16)));
> +               emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
> +             }
> +           return 3;
> +         }
> +       else if ((val3 & ~(val3 << i)) == val)
> +         {
> +           if (generate)
> +             {
> +               emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
> +               emit_insn (gen_insv_immdi (dest, GEN_INT (16),
> +                                          GEN_INT (val2 >> 16)));
> +               emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
> +                                                     dest));
> +             }
> +           return 3;
> +         }
>      }
>
>    /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
> @@ -25506,8 +25535,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>    rtx lo = gen_lowpart (SImode, src);
>    rtx hi = gen_highpart_mode (SImode, DImode, src);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>    if (!rtx_equal_p (lo, hi))
>      return false;
>
> @@ -25526,14 +25553,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>       MOV       w1, 49370
>       MOVK      w1, 0x140, lsl 16
>       STP       w1, w1, [x0]
> -   So we want to perform this only when we save two instructions
> -   or more.  When optimizing for size, however, accept any code size
> -   savings we can.  */
> -  if (size_p && orig_cost <= lo_cost)
> -    return false;
> -
> -  if (!size_p
> -      && (orig_cost <= lo_cost + 1))
> +   So we want to perform this when we save at least one instruction.  */
> +  if (orig_cost <= lo_cost)
>      return false;
>
>    rtx mem_lo = adjust_address (dst, SImode, 0);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 97f70d39cc0ddeb330e044bae0544d85a695567d..932d4d47a5db1a74e0d0565b565afbd769090853 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4618,7 +4618,7 @@ (define_insn "*and_<SHIFT:optab>si3_compare0_uxtw"
>    [(set_attr "type" "logics_shift_imm")]
>  )
>
> -(define_insn "*<LOGICAL:optab>_<SHIFT:optab><mode>3"
> +(define_insn "<LOGICAL:optab>_<SHIFT:optab><mode>3"
>    [(set (match_operand:GPI 0 "register_operand" "=r")
>         (LOGICAL:GPI (SHIFT:GPI
>                       (match_operand:GPI 1 "register_operand" "r")
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr105928.c b/gcc/testsuite/gcc.target/aarch64/pr105928.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ab52247df66020d0b8fe70bc81f572e8b64c2bb5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr105928.c
> @@ -0,0 +1,43 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> +  return 0x80402010080400;
> +}
> +
> +long f2 (void)
> +{
> +  return 0x1234567812345678;
> +}
> +
> +long f3 (void)
> +{
> +  return 0x4567800012345678;
> +}
> +
> +long f4 (void)
> +{
> +  return 0x3ecccccd3ecccccd;
> +}
> +
> +long f5 (void)
> +{
> +  return 0x38e38e38e38e38e;
> +}
> +
> +long f6 (void)
> +{
> +  return 0x1745d1745d1745d;
> +}
> +
> +void f7 (long *p)
> +{
> +  *p = 0x1234567812345678;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */
> +/* { dg-final { scan-assembler-times {\tbic\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\torr\t} 4 } } */
> +/* { dg-final { scan-assembler-times {\tstp\t} 1 } } */

Using "long" isn't ILP32-friendly.  It needs to be "long long"
(or uint64_t, etc.) instead.
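
For instance, f1 would become (sketch of the requested change):

long long f1 (void)
{
  return 0x80402010080400;
}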

OK with that change, thanks.

Richard

> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> index d025e989a1e67f00f4f4ce94897a961d38abfab7..2b8e64313bb47f995f071c728b1d84473807bc64 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -72,8 +72,7 @@ test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
>  **     ushr    v[0-9]+.16b, v[0-9]+.16b, 7
>  **     mov     x[0-9]+, 16512
>  **     movk    x[0-9]+, 0x1020, lsl 16
> -**     movk    x[0-9]+, 0x408, lsl 32
> -**     movk    x[0-9]+, 0x102, lsl 48
> +**     orr     x[0-9]+, x[0-9]+, x[0-9]+, lsl 28
>  **     fmov    d[0-9]+, x[0-9]+
>  **     pmull   v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
>  **     dup     v[0-9]+.2d, v[0-9]+.d\[0\]


* Re: [PATCH] AArch64: Improve immediate expansion [PR105928]
  2023-09-17 11:04 ` Richard Sandiford
@ 2023-09-18 17:41   ` Wilco Dijkstra
  2023-09-19  8:49     ` Richard Sandiford
  0 siblings, 1 reply; 5+ messages in thread
From: Wilco Dijkstra @ 2023-09-18 17:41 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: GCC Patches, Kyrylo Tkachov

Hi Richard,

> I was worried that reusing "dest" for intermediate results would
> prevent CSE for cases like:
>
> void g (long long, long long);
> void
> f (long long *ptr)
> {
>   g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
> }

Note that aarch64_internal_mov_immediate may be called after reload,
so it would end up even more complex.  This should be done as a
dedicated mid-end optimization, similar to TARGET_CONST_ANCHOR.
However, the number of 3- and 4-instruction immediates is so small that
shareable cases would be very rare, so I don't believe it is worth it.

Cheers,
Wilco


* Re: [PATCH] AArch64: Improve immediate expansion [PR105928]
  2023-09-18 17:41   ` Wilco Dijkstra
@ 2023-09-19  8:49     ` Richard Sandiford
  2023-09-19 12:57       ` Wilco Dijkstra
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Sandiford @ 2023-09-19  8:49 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GCC Patches, Kyrylo Tkachov

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> I was worried that reusing "dest" for intermediate results would
>> prevent CSE for cases like:
>>
>> void g (long long, long long);
>> void
>> f (long long *ptr)
>> {
>>   g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
>> }
>
> Note that aarch64_internal_mov_immediate may be called after reload,
> so it would end up even more complex.

The sequence I quoted was supposed to work before and after reload.  The:

		rtx tmp = aarch64_target_reg (dest, DImode);

would create a fresh temporary before reload and reuse dest otherwise.
So the sequence after reload would be the same as in your patch,
but the sequence before reload would use a temporary.

> This should be done as a
> dedicated mid-end optimization similar to TARGET_CONST_ANCHOR.
> However the number of 3/4-instruction immediates is so small that
> sharable cases would be very rare, so I don't believe it is worth it.

Yeah.  If, with a few tweaks, we could easily reuse the existing pass
flow to optimise the split forms, then it might have been worth it.
But I agree it's not worth doing something special that only works
for multi-insn immediates.

I think there are other cases where CSE after split would help though.

Thanks,
Richard


* Re: [PATCH] AArch64: Improve immediate expansion [PR105928]
  2023-09-19  8:49     ` Richard Sandiford
@ 2023-09-19 12:57       ` Wilco Dijkstra
  0 siblings, 0 replies; 5+ messages in thread
From: Wilco Dijkstra @ 2023-09-19 12:57 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: GCC Patches, Kyrylo Tkachov

Hi Richard,

>> Note that aarch64_internal_mov_immediate may be called after reload,
>> so it would end up even more complex.
>
> The sequence I quoted was supposed to work before and after reload.  The:
>
>                rtx tmp = aarch64_target_reg (dest, DImode);
>
> would create a fresh temporary before reload and reuse dest otherwise.
> So the sequence after reload would be the same as in your patch,
> but the sequence before reload would use a temporary.

aarch64_target_reg just returns the input register, so it won't do that.
Also, the movsi/movdi patterns only split if the destination register is
physical.  That's typically after register allocation, but not uniformly so
(e.g. immediates in returns get split early), which is inconsistent.  Given
that we always emit register notes, it's not obvious whether splitting
early or late is better overall.

Cheers,
Wilco
