public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v2] AArch64: Improve GOT addressing
@ 2021-05-24 12:12 Wilco Dijkstra
  2021-05-26  9:09 ` Richard Sandiford
  0 siblings, 1 reply; 3+ messages in thread
From: Wilco Dijkstra @ 2021-05-24 12:12 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Sandiford, Kyrylo Tkachov

Version v2 uses movsi/di for GOT accesses until after reload, as suggested. This
caused worse spilling; however, improving the costs of GOT accesses resulted in
better codesize and performance gains:

Improve GOT addressing by treating the instructions as a pair.  This reduces
register pressure and improves code quality significantly.  SPECINT2017 improves
by 0.30% with -fPIC and codesize is 0.7% smaller.  Perlbench has 0.9% smaller
codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1.

Passes bootstrap and regress. OK for commit?

ChangeLog:
2021-05-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload.
        (movdi): Likewise.
        * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
        splitting of GOT accesses until after reload.
        (aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
        (aarch64_mov_operand_p): Accept GOT accesses as move operands.
        (aarch_macro_fusion_pair_p): Fuse GOT accesses.

---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 641c83b479e76cbcc75b299eb7ae5f634d9db7cd..75b3caa94dd8a52342bbddbfcb73ab06a7418907 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 
     case SYMBOL_SMALL_GOT_4G:
       {
+	/* Don't split into ADRP/LDR until after reload - this improves
+	   CSE and rematerialization of GOT accesses.  */
+	if (!reload_completed)
+	  {
+	    emit_insn (gen_rtx_SET (dest, imm));
+	    return;
+	  }
+
 	/* In ILP32, the mode of dest can be either SImode or DImode,
 	   while the got entry is always of SImode size.  The mode of
 	   dest depends on how dest is used: if dest is assigned to a
@@ -13460,6 +13468,14 @@ cost_plus:
 	  *cost += COSTS_N_INSNS (1);
 	  if (speed)
 	    *cost += 2 * extra_cost->alu.arith;
+
+	  /* Set a low rematerialization cost for GOT accesses - this blocks
+	     them from being spilled and reduces register pressure.  */
+	  if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
+	      && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
+	    *cost = COSTS_N_INSNS (1) / 2;
+
+	  return true;
 	}
       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
@@ -19930,6 +19946,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
       return aarch64_simd_valid_immediate (x, NULL);
     }
 
+  /* GOT accesses are split after regalloc.  */
+  if (SYMBOL_REF_P (x)
+      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
+    return true;
+
   x = strip_salt (x);
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
@@ -23746,6 +23767,24 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
         }
     }
 
+  /* Always treat GOT accesses as a pair to ensure they can be easily
+     identified and optimized in linkers.  */
+  if (simple_sets_p)
+    {
+      /*  We're trying to match:
+	  prev (adrp) == (set (reg r1) (high (symbol_ref ("SYM"))))
+	  curr (ldr) == (set (reg r0)
+			(unspec [(mem (lo_sum (reg r1) (symbol_ref ("SYM"))))]
+			 UNSPEC_GOTSMALLPIC))  */
+
+      if (satisfies_constraint_Ush (SET_SRC (prev_set))
+	  && REG_P (SET_DEST (prev_set))
+	  && REG_P (SET_DEST (curr_set))
+	  && GET_CODE (SET_SRC (curr_set)) == UNSPEC
+	  && XINT (SET_SRC (curr_set), 1) == UNSPEC_GOTSMALLPIC)
+	return true;
+    }
+
   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
     {
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index abfd84526745d029ad4953eabad6dd17b159a218..2527c96576a78f2071da20721143a27adeb1551b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
-  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+	&& (aarch64_classify_symbolic_expression (operands[1])
+	    == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
-   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+	&& (aarch64_classify_symbolic_expression (operands[1])
+	    == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] AArch64: Improve GOT addressing
  2021-05-24 12:12 [PATCH v2] AArch64: Improve GOT addressing Wilco Dijkstra
@ 2021-05-26  9:09 ` Richard Sandiford
  2021-05-26 12:24   ` Wilco Dijkstra
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Sandiford @ 2021-05-26  9:09 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GCC Patches, Kyrylo Tkachov

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> @@ -23746,6 +23767,24 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
>          }
>      }
>
> +  /* Always treat GOT accesses as a pair to ensure they can be easily
> +     identified and optimized in linkers.  */

Sorry, I guess I'd not given enough weight to this part from your
earlier reply.  I was focusing too much on “Well there are no
optimizations that benefit from them being split, and there is
no gain from scheduling them independently.”

Are we actually planning to do any linker relaxations here, or is this
purely theoretical?  If doing relaxations is a realistic possibility then
I agree that would be a good/legitimate reason to use a single define_insn
for both instructions.  In that case though, there should be a comment
above the define_insn explaining that linker relaxation is the reason
for keeping the instructions together.

If the relaxations aren't a realistic possibility then I don't think we
want this fusion change either.

Thanks,
Richard

> +  if (simple_sets_p)
> +    {
> +      /*  We're trying to match:
> +         prev (adrp) == (set (reg r1) (high (symbol_ref ("SYM"))))
> +         curr (add) == (set (reg r0)
> +                       (unspec [(mem (lo_sum (reg r1) (symbol_ref ("SYM"))))]
> +                        UNSPEC_GOTSMALLPIC))  */
> +
> +      if (satisfies_constraint_Ush (SET_SRC (prev_set))
> +         && REG_P (SET_DEST (prev_set))
> +         && REG_P (SET_DEST (curr_set))
> +         && GET_CODE (SET_SRC (curr_set)) == UNSPEC
> +         && XINT (SET_SRC (curr_set), 1) == UNSPEC_GOTSMALLPIC)
> +       return true;
> +    }
> +
>    if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
>      {
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index abfd84526745d029ad4953eabad6dd17b159a218..2527c96576a78f2071da20721143a27adeb1551b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
>     fmov\\t%w0, %s1
>     fmov\\t%s0, %s1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
> -  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
> +    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
> +    || (reload_completed
> +       && (aarch64_classify_symbolic_expression (operands[1])
> +           == SYMBOL_SMALL_GOT_4G))"
>     [(const_int 0)]
>     "{
>         aarch64_expand_mov_immediate (operands[0], operands[1]);
> @@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%x0, %d1
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
> -   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)
> +    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
> +    || (reload_completed
> +       && (aarch64_classify_symbolic_expression (operands[1])
> +           == SYMBOL_SMALL_GOT_4G))"
>     [(const_int 0)]
>     "{
>         aarch64_expand_mov_immediate (operands[0], operands[1]);

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] AArch64: Improve GOT addressing
  2021-05-26  9:09 ` Richard Sandiford
@ 2021-05-26 12:24   ` Wilco Dijkstra
  0 siblings, 0 replies; 3+ messages in thread
From: Wilco Dijkstra @ 2021-05-26 12:24 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: GCC Patches, Kyrylo Tkachov

Hi Richard,

> Are we actually planning to do any linker relaxations here, or is this
> purely theoretical?  If doing relaxations is a realistic possibility then
> I agree that would be a good/legitimate reason to use a single define_insn
> for both instructions.  In that case though, there should be a comment
> above the define_insn explaining that linker relaxation is the reason
> for keeping the instructions together.

Yes, enabling linker relaxations is a key goal of this patch - it's a chicken
and egg problem since compilers split the instructions and schedule them
apart for no good reason, making such relaxations impossible. It turns out
that splitting early is very bad for code quality, so we can achieve smaller
and faster code as well.

I'll merge in the .md changes of the previous version so we don't need
scheduling fusion.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-05-26 12:24 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-24 12:12 [PATCH v2] AArch64: Improve GOT addressing Wilco Dijkstra
2021-05-26  9:09 ` Richard Sandiford
2021-05-26 12:24   ` Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).