public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
Date: Thu, 31 Mar 2022 13:51:17 +0800	[thread overview]
Message-ID: <20220331055117.6942-1-hongtao.liu@intel.com> (raw)

Since the CFG is freed before machine_reorg, just do a rough calculation
of the window according to the insn layout.
Also, based on an experiment on CLX, set the window size to 64.

Currently only V2DFmode loads are handled, since they don't need any
scratch registers, and that is sufficient to recover cray performance
for -O2 compared to GCC11.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No impact on SPEC2017 (same binary for both O2 and Ofast).
Ok for trunk?

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_split_stlf_stall_load): New
	function.
	(ix86_reorg): Call ix86_split_stlf_stall_load.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 47 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 ++++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 ++++++
 3 files changed, 71 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5a561966eb4..f9169b04d43 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21933,7 +21933,53 @@ ix86_seh_fixup_eh_fallthru (void)
       emit_insn_after (gen_nops (const1_rtx), insn);
     }
 }
+/* Split V2DFmode loads from a PARM_DECL into two DFmode loads (low half,
+   then high half) to avoid STLF stalls; stops after 64 nondebug insns.  */
+static void
+ix86_split_stlf_stall_load ()
+{
+  basic_block bb;
+  unsigned window = 0;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+	  window++;
+	  /* On CLX, 64 independent vaddps %xmm18, %xmm19, %xmm20 insns
+	     (pipeline filler) before the stalled load make the STLF-stall
+	     case as fast as the no-stall case, hence the window of 64.
+	     The CFG is freed before machine_reorg, so the window is only a
+	     rough insn count.  NOTE(review): confirm FOR_EACH_BB_FN is OK.  */
+	  if (window > 64)
+	    return;
 
+	  rtx set = single_set (insn);
+	  if (!set)
+	    continue;
+	  rtx src = SET_SRC (set);
+	  if (!MEM_P (src)
+	      /* Only V2DFmode loads are handled, since splitting them does
+		 not need any scratch register.  */
+	      || GET_MODE (src) != E_V2DFmode
+	      || !MEM_EXPR (src)
+	      || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+	    continue;
+
+	  rtx zero = CONST0_RTX (V2DFmode);
+	  rtx dest = SET_DEST (set);
+	  rtx m = adjust_address (src, DFmode, 0);
+	  emit_insn_before (gen_sse2_loadlpd (dest, zero, m), insn);
+	  m = adjust_address (src, DFmode, 8);
+	  PATTERN (insn) = gen_sse2_loadhpd (dest, dest, m);
+	  INSN_CODE (insn) = -1;
+	  gcc_assert (recog_memoized (insn) != -1);
+	}
+    }
+
+}
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -21948,6 +21994,7 @@ ix86_reorg (void)
 
   if (optimize && optimize_function_for_speed_p (cfun))
     {
+      ix86_split_stlf_stall_load ();
       if (TARGET_PAD_SHORT_FUNCTION)
 	ix86_pad_short_function ();
       else if (TARGET_PAD_RETURNS)
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..33d9684f0ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..45060b73c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
-- 
2.18.1


             reply	other threads:[~2022-03-31  5:51 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-31  5:51 liuhongt [this message]
2022-03-31 10:44 ` Richard Biener
2022-04-01  6:29   ` Hongtao Liu
2022-04-01  6:46     ` liuhongt
2022-04-01  6:53       ` Richard Biener
2022-04-01  7:14         ` Hongtao Liu
2022-04-01  7:20           ` Richard Biener
2022-04-01  7:51             ` [PATCH V3] " liuhongt
2022-04-04 11:47               ` Hongtao Liu
2022-04-01  6:47     ` [PATCH] " Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220331055117.6942-1-hongtao.liu@intel.com \
    --to=hongtao.liu@intel.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).