public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Improve optimizer to avoid stack spill across pure function call
@ 2024-06-14 11:10 user202729
From: user202729 @ 2024-06-14 11:10 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2729 bytes --]

This patch was inspired by PR 110137. It reduces the amount of stack spilling by ensuring that more values are known to remain unchanged across a pure function call.

It does not add any new flag; rather, it makes the optimizer generate better code by default.
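
Here a "pure" function means one declared with GCC's pure attribute: it may read global memory but must not modify it, so the call itself cannot change values that are live across it. For instance:

	/* f may read global state but must never modify it.  */
	int f () __attribute__ ((pure));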

For the added test file, the change is as follows. As can be seen, the number of memory operations is almost cut in half ("almost" because, in the "after" version, rbx also has to be saved and restored so that rdi can be preserved in it across the call).
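
For reference, the function under test is (copied from the attached test file; f is a pure function as above, and sink forces its argument to stay observable):

	int
	gg (MyClass&& a)
	{
	  MyClass b;
	  MyClass c = a;
	  int result = f ();
	  b = c;
	  sink (&b);
	  return result;
	}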

Before:

	_Z2ggO7MyClass:
	.LFB653:
		.cfi_startproc
		sub	rsp, 72
		.cfi_def_cfa_offset 80
		movdqu	xmm1, XMMWORD PTR [rdi]
		movdqu	xmm0, XMMWORD PTR [rdi+16]
		movaps	XMMWORD PTR [rsp+16], xmm1
		movaps	XMMWORD PTR [rsp], xmm0
		call	_Z1fv
		movdqa	xmm1, XMMWORD PTR [rsp+16]
		movdqa	xmm0, XMMWORD PTR [rsp]
		lea	rdx, [rsp+32]
		movaps	XMMWORD PTR [rsp+32], xmm1
		movaps	XMMWORD PTR [rsp+48], xmm0
		add	rsp, 72
		.cfi_def_cfa_offset 8
		ret
		.cfi_endproc

After:

	_Z2ggO7MyClass:
	.LFB653:
		.cfi_startproc
		push	rbx
		.cfi_def_cfa_offset 16
		.cfi_offset 3, -16
		mov	rbx, rdi
		sub	rsp, 32
		.cfi_def_cfa_offset 48
		call	_Z1fv
		movdqu	xmm0, XMMWORD PTR [rbx]
		movaps	XMMWORD PTR [rsp], xmm0
		movdqu	xmm0, XMMWORD PTR [rbx+16]
		movaps	XMMWORD PTR [rsp+16], xmm0
		add	rsp, 32
		.cfi_def_cfa_offset 16
		pop	rbx
		.cfi_def_cfa_offset 8
		ret
		.cfi_endproc

As explained in PR 110137, the reason I modify the RTL pass instead of a GIMPLE pass is that the code that currently handles this optimization lives in the IRA.

The optimization involved is: rewrite

	definition: a = something;
	...
	use a;

by moving the definition statement right before the use statement, provided that none of the statements in between modifies "something".
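
Schematically, at the source level the effect looks like this (an illustrative C-level sketch with made-up names p, f and use; the actual transformation happens on RTL):

	int a = *p;   /* definition: load through pointer p */
	f ();         /* pure call: cannot modify p or *p */
	use (a);      /* a is live across the call, forcing a spill */

becomes

	f ();
	int a = *p;   /* definition moved right before the use;
	                 nothing is live across the call anymore */
	use (a);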

The existing code only handles the case where "something" is a memory reference with a fixed address. The patch modifies the logic to also allow a memory reference whose address is not changed by the statements in between.
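
For example, given a global g and a pointer variable p (illustrative only):

	extern int g;
	int a = g;    /* MEM with the fixed address &g:
	                 already handled by the existing code */

	int b = *p;   /* MEM whose address is held in a (pseudo) register:
	                 newly allowed, as long as p itself is not modified
	                 by the statements in between */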

The only way I can think of to do that is to modify "validate_equiv_mem" so that it also validates the equivalence of the address, which may involve a pseudo-register.

Nevertheless, reviews and suggestions on how to improve the code, or on how to implement this in a GIMPLE pass instead, would be appreciated.

Bootstrapped and regression tested on x86_64-pc-linux-gnu.

The new test passes; there are some spurious failures in existing scan-assembler-* tests, but looking through them, the patch does not appear to actually pessimize the generated code.

I think this also fixes PR 103541 as a side effect, but I'm not sure whether the generated code there is optimal (it loads from the global variable twice, but there is no readily usable call-preserved register, so an additional memory operation would be needed either way).

[-- Attachment #2: a.patch --]
[-- Type: text/x-patch; name=a.patch, Size: 10864 bytes --]

From 3d8d04373716e031242678821c3e9904b6ac51cb Mon Sep 17 00:00:00 2001
From: user202729 <user202729@protonmail.com>
Date: Mon, 20 May 2024 18:12:58 +0800
Subject: [PATCH] Allow moving init from dereference across pure function call

gcc/ChangeLog:

	* ira.cc (struct equiv_mem_data): Rename to equiv_data.
	(validate_equiv_mem_from_store): Rename to validate_equiv_from_store.
	(uses_any_clobbered_hard_reg_p): New function.
	(validate_equiv_mem): Rename to validate_equiv.
	(equiv_init_movable_p): Relax several conditions.
	(update_equiv_regs): Relax the condition that the source must be a MEM.
	(add_store_equivs): Update changed function name.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avoid-stack-spill-pure-call.C: New test.
---
 gcc/ira.cc                                    | 157 ++++++++++++------
 .../i386/avoid-stack-spill-pure-call.C        |  32 ++++
 2 files changed, 140 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 5642aea3caa..1a1718a18c5 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -3054,28 +3054,80 @@ struct equivalence
 static struct equivalence *reg_equiv;
 
 /* Used for communication between the following two functions.  */
-struct equiv_mem_data
+struct equiv_data
 {
-  /* A MEM that we wish to ensure remains unchanged.  */
-  rtx equiv_mem;
+  /* An rtx that we wish to ensure remains unchanged.  */
+  rtx equiv;
 
-  /* Set true if EQUIV_MEM is modified.  */
-  bool equiv_mem_modified;
+  /* Set true if EQUIV is modified.  */
+  bool equiv_modified;
 };
 
-/* If EQUIV_MEM is modified by modifying DEST, indicate that it is modified.
+/* If EQUIV is modified by modifying DEST, indicate that it is modified.
    Called via note_stores.  */
 static void
-validate_equiv_mem_from_store (rtx dest, const_rtx set ATTRIBUTE_UNUSED,
-			       void *data)
+validate_equiv_from_store (rtx dest, const_rtx set ATTRIBUTE_UNUSED,
+			   void *data)
 {
-  struct equiv_mem_data *info = (struct equiv_mem_data *) data;
+  struct equiv_data *info = (struct equiv_data *) data;
 
-  if ((REG_P (dest)
-       && reg_overlap_mentioned_p (dest, info->equiv_mem))
-      || (MEM_P (dest)
-	  && anti_dependence (info->equiv_mem, dest)))
-    info->equiv_mem_modified = true;
+  if (REG_P (dest))
+    {
+      if (reg_overlap_mentioned_p (dest, info->equiv))
+	info->equiv_modified = true;
+    }
+  else if (MEM_P (dest))
+    {
+      if (REG_P (info->equiv))
+	{
+	  /* A store to memory cannot modify the value of a register.  */
+	}
+      else if (MEM_P (info->equiv))
+	{
+	  if (anti_dependence (info->equiv, dest))
+	    info->equiv_modified = true;
+	}
+      else
+	info->equiv_modified = true; /* Be conservative.  */
+    }
+  else
+    info->equiv_modified = true; /* Be conservative.  */
+}
+
+/* Return true if X uses any hard register that can be clobbered across
+   a function call.  */
+static bool
+uses_any_clobbered_hard_reg_p (rtx x)
+{
+  int i, j;
+  const char *fmt;
+  enum rtx_code code;
+
+  if (x == NULL_RTX)
+    return false;
+  code = GET_CODE (x);
+
+  if (SUBREG_P (x))
+    return uses_any_clobbered_hard_reg_p (SUBREG_REG (x));
+  if (REG_P (x))
+    return HARD_REGISTER_P (x)
+      && crtl->abi->clobbers_at_least_part_of_reg_p (REGNO (x));
+  fmt = GET_RTX_FORMAT (code);
+  for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'e')
+	{
+	  if (uses_any_clobbered_hard_reg_p (XEXP (x, i)))
+	    return true;
+	}
+      else if (fmt[i] == 'E')
+	{
+	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
+	    if (uses_any_clobbered_hard_reg_p (XVECEXP (x, i, j)))
+	      return true;
+	}
+    }
+  return false;
 }
 
 static bool equiv_init_varies_p (rtx x);
@@ -3083,24 +3135,24 @@ static bool equiv_init_varies_p (rtx x);
 enum valid_equiv { valid_none, valid_combine, valid_reload };
 
 /* Verify that no store between START and the death of REG invalidates
-   MEMREF.  MEMREF is invalidated by modifying a register used in MEMREF,
+   X.  X is invalidated by modifying a register used in X,
    by storing into an overlapping memory location, or with a non-const
    CALL_INSN.
 
-   Return VALID_RELOAD if MEMREF remains valid for both reload and
-   combine_and_move insns, VALID_COMBINE if only valid for
+   Return VALID_RELOAD if X remains valid for both reload and
+   combine_and_move_insns, VALID_COMBINE if only valid for
    combine_and_move_insns, and VALID_NONE otherwise.  */
 static enum valid_equiv
-validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref)
+validate_equiv (rtx_insn *start, rtx reg, rtx x)
 {
   rtx_insn *insn;
   rtx note;
-  struct equiv_mem_data info = { memref, false };
+  struct equiv_data info = { x, false };
   enum valid_equiv ret = valid_reload;
 
   /* If the memory reference has side effects or is volatile, it isn't a
      valid equivalence.  */
-  if (side_effects_p (memref))
+  if (side_effects_p (x))
     return valid_none;
 
   for (insn = start; insn; insn = NEXT_INSN (insn))
@@ -3113,35 +3165,43 @@ validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref)
 
       if (CALL_P (insn))
 	{
-	  /* We can combine a reg def from one insn into a reg use in
-	     another over a call if the memory is readonly or the call
-	     const/pure.  However, we can't set reg_equiv notes up for
-	     reload over any call.  The problem is the equivalent form
-	     may reference a pseudo which gets assigned a call
-	     clobbered hard reg.  When we later replace REG with its
-	     equivalent form, the value in the call-clobbered reg has
-	     been changed and all hell breaks loose.  */
-	  ret = valid_combine;
-	  if (!MEM_READONLY_P (memref)
-	      && (!RTL_CONST_OR_PURE_CALL_P (insn)
-		  || equiv_init_varies_p (XEXP (memref, 0))))
-	    return valid_none;
+	  if (RTL_CONST_OR_PURE_CALL_P (insn)
+	      && !uses_any_clobbered_hard_reg_p (x))
+	    {
+	      /* All is good: the call cannot modify X, and the
+		 call-clobber problem explained below does not arise.  */
+	    }
+	  else
+	    {
+	      /* We can combine a reg def from one insn into a reg use in
+		 another over a call if the memory is readonly or the call
+		 const/pure.  However, we can't set reg_equiv notes up for
+		 reload over any call.  The problem is the equivalent form
+		 may reference a pseudo which gets assigned a call
+		 clobbered hard reg.  When we later replace REG with its
+		 equivalent form, the value in the call-clobbered reg has
+		 been changed and all hell breaks loose.  */
+	      ret = valid_combine;
+	      if (MEM_P (x) && !MEM_READONLY_P (x)
+		  && (!RTL_CONST_OR_PURE_CALL_P (insn)
+		      || equiv_init_varies_p (XEXP (x, 0))))
+		return valid_none;
+	    }
 	}
 
-      note_stores (insn, validate_equiv_mem_from_store, &info);
-      if (info.equiv_mem_modified)
+      note_stores (insn, validate_equiv_from_store, &info);
+      if (info.equiv_modified)
 	return valid_none;
 
-      /* If a register mentioned in MEMREF is modified via an
-	 auto-increment, we lose the equivalence.  Do the same if one
-	 dies; although we could extend the life, it doesn't seem worth
-	 the trouble.  */
+      /* If a register mentioned in X is modified via an
+	 auto-increment, we lose the equivalence.  */
 
       for (note = REG_NOTES (insn); note; note = XEXP (note, 1))
-	if ((REG_NOTE_KIND (note) == REG_INC
-	     || REG_NOTE_KIND (note) == REG_DEAD)
+	if (REG_NOTE_KIND (note) == REG_INC
 	    && REG_P (XEXP (note, 0))
-	    && reg_overlap_mentioned_p (XEXP (note, 0), memref))
+	    && reg_overlap_mentioned_p (XEXP (note, 0), x))
 	  return valid_none;
     }
 
@@ -3227,7 +3287,7 @@ equiv_init_movable_p (rtx x, int regno)
 
     case REG:
       return ((reg_equiv[REGNO (x)].loop_depth >= reg_equiv[regno].loop_depth
-	       && reg_equiv[REGNO (x)].replace)
+	       && reg_equiv[REGNO (x)].replacement)
 	      || (REG_BASIC_BLOCK (REGNO (x)) < NUM_FIXED_BLOCKS
 		  && ! rtx_varies_p (x, 0)));
 
@@ -3751,8 +3811,8 @@ update_equiv_regs (void)
 	    }
 
 	  /* If this insn introduces a "constant" register, decrease the priority
-	     of that register.  Record this insn if the register is only used once
-	     more and the equivalence value is the same as our source.
+	     of that register.  Record this insn if the register is only used
+	     once more and the equivalence value is the same as our source.
 
 	     The latter condition is checked for two reasons:  First, it is an
 	     indication that it may be more efficient to actually emit the insn
@@ -3761,19 +3821,18 @@ update_equiv_regs (void)
 	     dying in this insn whose death notes would be missed.
 
 	     If we don't have a REG_EQUIV note, see if this insn is loading
-	     a register used only in one basic block from a MEM.  If so, and the
-	     MEM remains unchanged for the life of the register, add a REG_EQUIV
-	     note.  */
+	     a register used only in one basic block.  If so, and the
+	     SET_SRC (set) remains unchanged for the life of the register, add a
+	     REG_EQUIV note.  */
 	  note = find_reg_note (insn, REG_EQUIV, NULL_RTX);
 
 	  rtx replacement = NULL_RTX;
 	  if (note)
 	    replacement = XEXP (note, 0);
-	  else if (REG_BASIC_BLOCK (regno) >= NUM_FIXED_BLOCKS
-		   && MEM_P (SET_SRC (set)))
+	  else if (REG_BASIC_BLOCK (regno) >= NUM_FIXED_BLOCKS)
 	    {
 	      enum valid_equiv validity;
-	      validity = validate_equiv_mem (insn, dest, SET_SRC (set));
+	      validity = validate_equiv (insn, dest, SET_SRC (set));
 	      if (validity != valid_none)
 		{
 		  replacement = copy_rtx (SET_SRC (set));
@@ -3869,7 +3928,7 @@ add_store_equivs (void)
 	  && (init_insn = reg_equiv[regno].init_insns->insn ()) != 0
 	  && bitmap_bit_p (seen_insns, INSN_UID (init_insn))
 	  && ! find_reg_note (init_insn, REG_EQUIV, NULL_RTX)
-	  && validate_equiv_mem (init_insn, src, dest) == valid_reload
+	  && validate_equiv (init_insn, src, dest) == valid_reload
 	  && ! memref_used_between_p (dest, init_insn, insn)
 	  /* Attaching a REG_EQUIV note will fail if INIT_INSN has
 	     multiple sets.  */
diff --git a/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C b/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C
new file mode 100644
index 00000000000..f8d200cd4fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-fdump-rtl-final -O3" } */
+
+#include <array>
+#include <cstdint>
+
+struct MyClass
+{
+  std::array<uint64_t, 4> arr;
+};
+
+// Force the argument to escape so that stores to it are not optimized away.
+void sink (void *m) {
+  asm volatile ("" : : "g"(m) : "memory");
+}
+
+int __attribute__((pure))
+f ();
+
+int
+gg (MyClass&& a)
+{
+  MyClass b;
+  MyClass c = a;
+  int result = f ();
+  b = c;
+  sink (&b);
+  return result;
+}
+
+/* One store to save rbx, plus one for each of the two xmm stores.  */
+/* { dg-final { scan-rtl-dump-times {\(set \(mem} 3 "final" } } */
-- 
2.34.1

