public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
@ 2022-03-31  5:51 liuhongt
  2022-03-31 10:44 ` Richard Biener
  0 siblings, 1 reply; 10+ messages in thread
From: liuhongt @ 2022-03-31  5:51 UTC (permalink / raw)
  To: gcc-patches

Since cfg is freed before machine_reorg, just do a rough calculation
of the window according to the layout.
Also according to an experiment on CLX, set window size to 64.

Currently only handle V2DFmode load since it doesn't need any scratch
registers, and it's sufficient to recover cray performance for -O2
compared to GCC11.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No impact on SPEC2017 (same binary for both O2 and Ofast).
Ok for trunk?

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_split_stlf_stall_load): New
	function.
	(ix86_reorg): Call ix86_split_stlf_stall_load.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 47 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 ++++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 ++++++
 3 files changed, 71 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5a561966eb4..f9169b04d43 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21933,7 +21933,53 @@ ix86_seh_fixup_eh_fallthru (void)
       emit_insn_after (gen_nops (const1_rtx), insn);
     }
 }
+/* Split vector load from parm_decl to elemental loads to avoid STLF
+   stalls.  */
+static void
+ix86_split_stlf_stall_load ()
+{
+  basic_block bb;
+  unsigned window = 0;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+	  window++;
+	  /* On CLX, with 64 independent vaddps %xmm18, %xmm19, %xmm20 insns
+	     (emulating a busy pipeline) before the stalled load, the STLF stall
+	     case is as fast as the no-stall case, hence a window of 64 insns.
+	     Since CFG is freed before machine_reorg, just do a rough
+	     calculation of the window according to the layout.  */
+	  if (window > 64)
+	    return;
 
+	  rtx set = single_set (insn);
+	  if (!set)
+	    continue;
+	  rtx src = SET_SRC (set);
+	  if (!MEM_P (src)
+	      /* Only handle V2DFmode load since it doesn't need any scratch
+		 register.  */
+	      || GET_MODE (src) != E_V2DFmode
+	      || !MEM_EXPR (src)
+	      || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+	    continue;
+
+	  rtx zero = CONST0_RTX (V2DFmode);
+	  rtx dest = SET_DEST (set);
+	  rtx m = adjust_address (src, DFmode, 0);
+	  emit_insn_before (gen_sse2_loadlpd (dest, zero, m), insn);
+	  m = adjust_address (src, DFmode, 8);
+	  PATTERN (insn) = gen_sse2_loadhpd (dest, dest, m);
+	  INSN_CODE (insn) = -1;
+	  gcc_assert (recog_memoized (insn) != -1);
+	}
+    }
+
+}
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -21948,6 +21994,7 @@ ix86_reorg (void)
 
   if (optimize && optimize_function_for_speed_p (cfun))
     {
+      ix86_split_stlf_stall_load ();
       if (TARGET_PAD_SHORT_FUNCTION)
 	ix86_pad_short_function ();
       else if (TARGET_PAD_RETURNS)
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..33d9684f0ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..45060b73c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
  2022-03-31  5:51 [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls liuhongt
@ 2022-03-31 10:44 ` Richard Biener
  2022-04-01  6:29   ` Hongtao Liu
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Biener @ 2022-03-31 10:44 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Thu, Mar 31, 2022 at 7:51 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Since cfg is freed before machine_reorg, just do a rough calculation
> of the window according to the layout.
> Also according to an experiment on CLX, set window size to 64.
>
> Currently only handle V2DFmode load since it doesn't need any scratch
> registers, and it's sufficient to recover cray performance for -O2
> compared to GCC11.
>
> Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}.
> No impact for SPEC2017(same binary for both O2 and Ofast).
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
>         function
>         (ix86_reorg): Call ix86_split_stlf_stall_load.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 47 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 ++++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 ++++++
>  3 files changed, 71 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 5a561966eb4..f9169b04d43 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21933,7 +21933,53 @@ ix86_seh_fixup_eh_fallthru (void)
>        emit_insn_after (gen_nops (const1_rtx), insn);
>      }
>  }
> +/* Split vector load from parm_decl to elemental loads to avoid STLF
> +   stalls.  */
> +static void
> +ix86_split_stlf_stall_load ()
> +{
> +  basic_block bb;
> +  unsigned window = 0;
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      rtx_insn *insn;
> +      FOR_BB_INSNS (bb, insn)
> +       {
> +         if (!NONDEBUG_INSN_P (insn))
> +           continue;
> +         window++;
> +         /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> +            other, just emulate for pipeline) before stalled load, stlf stall
> +            case is as fast as no stall cases on CLX.
> +            Since CFG is freed before machine_reorg, just do a rough
> +            calculation of the window according to the layout.  */
> +         if (window > 64)
> +           return;

I wonder if we should also return for any_uncondjump_p (insn)
(not sure if that captures returnjump_p), or maybe just explicitly
allow any_condjump_p and reject other PC setters.

Likewise we might want to stop at a LABEL that can be backwards reached.

I suppose people more familiar with cfgrtl can suggest something better.

> +         rtx set = single_set (insn);
> +         if (!set)
> +           continue;
> +         rtx src = SET_SRC (set);
> +         if (!MEM_P (src)
> +             /* Only handle V2DFmode load since it doesn't need any scratch
> +                register.  */
> +             || GET_MODE (src) != E_V2DFmode
> +             || !MEM_EXPR (src)
> +             || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)

I wonder if we have (easy) ways to detect whether XEXP (src, 0) is
frame/stack based rather than requiring a MEM_EXPR.  There is
may_be_sp_based_p () exported from alias.c, for example, but I'm not
sure whether that works after RA & frame elimination.

> +           continue;
> +
> +         rtx zero = CONST0_RTX (V2DFmode);
> +         rtx dest = SET_DEST (set);
> +         rtx m = adjust_address (src, DFmode, 0);
> +         emit_insn_before (gen_sse2_loadlpd (dest, zero, m), insn);

Can SSE1 also do this?

> +         m = adjust_address (src, DFmode, 8);
> +         PATTERN (insn) = gen_sse2_loadhpd (dest, dest, m);
> +         INSN_CODE (insn) = -1;
> +         gcc_assert (recog_memoized (insn) != -1);

I think we want to dump something into dump_file when we split an insn here.

> +       }
> +    }
> +
> +}
>  /* Implement machine specific optimizations.  We implement padding of returns
>     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
>  static void
> @@ -21948,6 +21994,7 @@ ix86_reorg (void)
>
>    if (optimize && optimize_function_for_speed_p (cfun))
>      {
> +      ix86_split_stlf_stall_load ();
>        if (TARGET_PAD_SHORT_FUNCTION)
>         ix86_pad_short_function ();

btw. this function suggests we do have edges, so doing something "better"
than FOR_EACH_BB_FN, aka walking blocks in layout order, might be
possible after all.  For example ix86_avoid_jump_mispredicts just walks
the function by looking at get_insns(), that might be more closely what
"as laid out" is.

>        else if (TARGET_PAD_RETURNS)
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..33d9684f0ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..45060b73c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
  2022-03-31 10:44 ` Richard Biener
@ 2022-04-01  6:29   ` Hongtao Liu
  2022-04-01  6:46     ` liuhongt
  2022-04-01  6:47     ` [PATCH] " Richard Biener
  0 siblings, 2 replies; 10+ messages in thread
From: Hongtao Liu @ 2022-04-01  6:29 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, GCC Patches

On Thu, Mar 31, 2022 at 6:45 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Mar 31, 2022 at 7:51 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Since cfg is freed before machine_reorg, just do a rough calculation
> > of the window according to the layout.
> > Also according to an experiment on CLX, set window size to 64.
> >
> > Currently only handle V2DFmode load since it doesn't need any scratch
> > registers, and it's sufficient to recover cray performance for -O2
> > compared to GCC11.
> >
> > Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}.
> > No impact for SPEC2017(same binary for both O2 and Ofast).
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR target/101908
> >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> >         function
> >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr101908-1.c: New test.
> >         * gcc.target/i386/pr101908-2.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                    | 47 ++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 ++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 ++++++
> >  3 files changed, 71 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 5a561966eb4..f9169b04d43 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -21933,7 +21933,53 @@ ix86_seh_fixup_eh_fallthru (void)
> >        emit_insn_after (gen_nops (const1_rtx), insn);
> >      }
> >  }
> > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > +   stalls.  */
> > +static void
> > +ix86_split_stlf_stall_load ()
> > +{
> > +  basic_block bb;
> > +  unsigned window = 0;
> > +  FOR_EACH_BB_FN (bb, cfun)
> > +    {
> > +      rtx_insn *insn;
> > +      FOR_BB_INSNS (bb, insn)
> > +       {
> > +         if (!NONDEBUG_INSN_P (insn))
> > +           continue;
> > +         window++;
> > +         /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > +            other, just emulate for pipeline) before stalled load, stlf stall
> > +            case is as fast as no stall cases on CLX.
> > +            Since CFG is freed before machine_reorg, just do a rough
> > +            calculation of the window according to the layout.  */
> > +         if (window > 64)
> > +           return;
>
> I wonder if we should also return for any_uncondjump_p (insn)
> (not sure if that captures returnjump_p), or maybe just explicitly
> allow any_condjump_p and reject other PC setters.
>
I guess it doesn't include call.
> Likewise we might want to stop at a LABEL that can be backwards reached.
>
I think checking for a load from a parm_decl should mostly avoid splitting
loads inside a loop (assuming the optimizer hoists such loads out).
> I suppose people more familiar with cfgrtl can suggest something better.
>
> > +         rtx set = single_set (insn);
> > +         if (!set)
> > +           continue;
> > +         rtx src = SET_SRC (set);
> > +         if (!MEM_P (src)
> > +             /* Only handle V2DFmode load since it doesn't need any scratch
> > +                register.  */
> > +             || GET_MODE (src) != E_V2DFmode
> > +             || !MEM_EXPR (src)
> > +             || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
>
> I wonder if we have (easy) ways to detect whether XEXP (src, 0) is
> frame/stack based
> rather than requiring a MEM_EXPR.  There is may_be_sp_based_p ()
may_be_sp_based_p only checks for the stack pointer, which is not suitable after RA.
> exported from alias.c
> for example, but I'm not sure whether that works after RA & frame elimination.
>
> > +           continue;
> > +
> > +         rtx zero = CONST0_RTX (V2DFmode);
> > +         rtx dest = SET_DEST (set);
> > +         rtx m = adjust_address (src, DFmode, 0);
> > +         emit_insn_before (gen_sse2_loadlpd (dest, zero, m), insn);
>
> Can SSE1 also do this?
x86 does have movlps and movhps, but the problem is that movlps loads 64 bits
of memory into an xmm register without changing the upper bits, which may
cause a partial register dependence.
>
> > +         m = adjust_address (src, DFmode, 8);
> > +         PATTERN (insn) = gen_sse2_loadhpd (dest, dest, m);
> > +         INSN_CODE (insn) = -1;
> > +         gcc_assert (recog_memoized (insn) != -1);
>
> I think we want to dump something into dump_file when we split an insn here.
Good idea.
>
> > +       }
> > +    }
> > +
> > +}
> >  /* Implement machine specific optimizations.  We implement padding of returns
> >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> >  static void
> > @@ -21948,6 +21994,7 @@ ix86_reorg (void)
> >
> >    if (optimize && optimize_function_for_speed_p (cfun))
> >      {
> > +      ix86_split_stlf_stall_load ();
> >        if (TARGET_PAD_SHORT_FUNCTION)
> >         ix86_pad_short_function ();
>
> btw. this function suggests we do have edges, so doing something "better"
> than FOR_EACH_BB_FN, aka walking blocks in layout order, might be
> possible after all.  For example ix86_avoid_jump_mispredicts just walks
> the function by looking at get_insns(), that might be more closely what
> "as laid out" is.
I prefer to use get_insns ().
>
> >        else if (TARGET_PAD_RETURNS)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > new file mode 100644
> > index 00000000000..33d9684f0ad
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X* x, struct X* y)
> > +{
> > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > new file mode 100644
> > index 00000000000..45060b73c06
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > +
> > +struct X { double x[4]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X x, struct X y)
> > +{
> > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > +}
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
  2022-04-01  6:29   ` Hongtao Liu
@ 2022-04-01  6:46     ` liuhongt
  2022-04-01  6:53       ` Richard Biener
  2022-04-01  6:47     ` [PATCH] " Richard Biener
  1 sibling, 1 reply; 10+ messages in thread
From: liuhongt @ 2022-04-01  6:46 UTC (permalink / raw)
  To: gcc-patches

Update in V2:
1. Use get_insns instead of FOR_EACH_BB_FN and FOR_BB_INSNS.
2. Return for any_uncondjump_p and ANY_RETURN_P.
3. Add dump info for splitting instruction.
4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.

Since cfg is freed before machine_reorg, just do a rough calculation
of the window according to the layout.
Also according to an experiment on CLX, set window size to 64.

Currently only handle V2DFmode load since it doesn't need any scratch
registers, and it's sufficient to recover cray performance for -O2
compared to GCC11.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_split_stlf_stall_load): New
	function.
	(ix86_reorg): Call ix86_split_stlf_stall_load.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
 3 files changed, 84 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5a561966eb4..c88a689f32b 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
       emit_insn_after (gen_nops (const1_rtx), insn);
     }
 }
+/* Split vector load from parm_decl to elemental loads to avoid STLF
+   stalls.  */
+static void
+ix86_split_stlf_stall_load ()
+{
+  rtx_insn* insn, *start = get_insns ();
+  unsigned window = 0;
+
+  for (insn = start; insn; insn = NEXT_INSN (insn))
+    {
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      window++;
+      /* On CLX, with 64 independent vaddps %xmm18, %xmm19, %xmm20 insns
+	 (emulating a busy pipeline) before the stalled load, the STLF stall
+	 case is as fast as the no-stall case, hence a window of 64 insns.
+	 Since CFG is freed before machine_reorg, just do a rough
+	 calculation of the window according to the layout.  */
+      if (window > 64)
+	return;
+
+      if (any_uncondjump_p (insn)
+	  || ANY_RETURN_P (PATTERN (insn)))
+	return;
+
+      rtx set = single_set (insn);
+      if (!set)
+	continue;
+      rtx src = SET_SRC (set);
+      if (!MEM_P (src)
+	  /* Only handle V2DFmode load since it doesn't need any scratch
+	     register.  */
+	  || GET_MODE (src) != E_V2DFmode
+	  || !MEM_EXPR (src)
+	  || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+	continue;
+
+      rtx zero = CONST0_RTX (V2DFmode);
+      rtx dest = SET_DEST (set);
+      rtx m = adjust_address (src, DFmode, 0);
+      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
+      emit_insn_before (loadlpd, insn);
+      m = adjust_address (src, DFmode, 8);
+      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fputs ("Due to potential STLF stall, split instruction:\n",
+		 dump_file);
+	  print_rtl_single (dump_file, insn);
+	  fputs ("To:\n", dump_file);
+	  print_rtl_single (dump_file, loadlpd);
+	  print_rtl_single (dump_file, loadhpd);
+	}
+      PATTERN (insn) = loadhpd;
+      INSN_CODE (insn) = -1;
+      gcc_assert (recog_memoized (insn) != -1);
+    }
+}
 
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
@@ -21948,6 +22006,8 @@ ix86_reorg (void)
 
   if (optimize && optimize_function_for_speed_p (cfun))
     {
+      if (TARGET_SSE2)
+	ix86_split_stlf_stall_load ();
       if (TARGET_PAD_SHORT_FUNCTION)
 	ix86_pad_short_function ();
       else if (TARGET_PAD_RETURNS)
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..33d9684f0ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..45060b73c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
  2022-04-01  6:29   ` Hongtao Liu
  2022-04-01  6:46     ` liuhongt
@ 2022-04-01  6:47     ` Richard Biener
  1 sibling, 0 replies; 10+ messages in thread
From: Richard Biener @ 2022-04-01  6:47 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, GCC Patches

On Fri, Apr 1, 2022 at 8:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Mar 31, 2022 at 6:45 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Thu, Mar 31, 2022 at 7:51 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Since cfg is freed before machine_reorg, just do a rough calculation
> > > of the window according to the layout.
> > > Also according to an experiment on CLX, set window size to 64.
> > >
> > > Currently only handle V2DFmode load since it doesn't need any scratch
> > > registers, and it's sufficient to recover cray performance for -O2
> > > compared to GCC11.
> > >
> > > Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}.
> > > No impact for SPEC2017(same binary for both O2 and Ofast).
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/101908
> > >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> > >         function
> > >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr101908-1.c: New test.
> > >         * gcc.target/i386/pr101908-2.c: New test.
> > > ---
> > >  gcc/config/i386/i386.cc                    | 47 ++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 ++++++
> > >  3 files changed, 71 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index 5a561966eb4..f9169b04d43 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -21933,7 +21933,53 @@ ix86_seh_fixup_eh_fallthru (void)
> > >        emit_insn_after (gen_nops (const1_rtx), insn);
> > >      }
> > >  }
> > > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > > +   stalls.  */
> > > +static void
> > > +ix86_split_stlf_stall_load ()
> > > +{
> > > +  basic_block bb;
> > > +  unsigned window = 0;
> > > +  FOR_EACH_BB_FN (bb, cfun)
> > > +    {
> > > +      rtx_insn *insn;
> > > +      FOR_BB_INSNS (bb, insn)
> > > +       {
> > > +         if (!NONDEBUG_INSN_P (insn))
> > > +           continue;
> > > +         window++;
> > > +         /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > > +            other, just emulate for pipeline) before stalled load, stlf stall
> > > +            case is as fast as no stall cases on CLX.
> > > +            Since CFG is freed before machine_reorg, just do a rough
> > > +            calculation of the window according to the layout.  */
> > > +         if (window > 64)
> > > +           return;
> >
> > I wonder if we should also return for any_uncondjump_p (insn)
> > > (not sure if that captures returnjump_p), or maybe just explicitly
> > allow any_condjump_p and reject other PC setters.
> >
> I guess it doesn't include call.
> > Likewise we might want to stop at a LABEL that can be backwards reached.
> >
> I think checking load from parm_decl can somehow avoid split load in a
> loop(assume optimizer will hoist that out).
> > I suppose people more familiar with cfgrtl can suggest something better.
> >
> > > +         rtx set = single_set (insn);
> > > +         if (!set)
> > > +           continue;
> > > +         rtx src = SET_SRC (set);
> > > +         if (!MEM_P (src)
> > > +             /* Only handle V2DFmode load since it doesn't need any scratch
> > > +                register.  */
> > > +             || GET_MODE (src) != E_V2DFmode
> > > +             || !MEM_EXPR (src)
> > > +             || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
> >
> > I wonder if we have (easy) ways to detect whether XEXP (src, 0) is
> > frame/stack based
> > rather than requiring a MEM_EXPR.  There is may_be_sp_based_p ()
> may_be_sp_based_p just checks stack pointer which is not suitable after RA.
> > exported from alias.c
> > for example, but I'm not sure whether that works after RA & frame elimination.
> >
> > > +           continue;
> > > +
> > > +         rtx zero = CONST0_RTX (V2DFmode);
> > > +         rtx dest = SET_DEST (set);
> > > +         rtx m = adjust_address (src, DFmode, 0);
> > > +         emit_insn_before (gen_sse2_loadlpd (dest, zero, m), insn);
> >
> > Can SSE1 also do this?
> X86 does have movlps and movhps, but the problem is movlps load 64bit
> memory to xmm w/o change upper bits which may cause partial register
> dependence.

So do we need to guard this transform on SSE2 availability then?

> >
> > > +         m = adjust_address (src, DFmode, 8);
> > > +         PATTERN (insn) = gen_sse2_loadhpd (dest, dest, m);
> > > +         INSN_CODE (insn) = -1;
> > > +         gcc_assert (recog_memoized (insn) != -1);
> >
> > I think we want to dump something into dump_file when we split an insn here.
> Good idea.
> >
> > > +       }
> > > +    }
> > > +
> > > +}
> > >  /* Implement machine specific optimizations.  We implement padding of returns
> > >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> > >  static void
> > > @@ -21948,6 +21994,7 @@ ix86_reorg (void)
> > >
> > >    if (optimize && optimize_function_for_speed_p (cfun))
> > >      {
> > > +      ix86_split_stlf_stall_load ();
> > >        if (TARGET_PAD_SHORT_FUNCTION)
> > >         ix86_pad_short_function ();
> >
> > btw. this function suggests we do have edges, so doing something "better"
> > than FOR_EACH_BB_FN, aka walking blocks in layout order, might be
> > possible after all.  For example ix86_avoid_jump_mispredicts just walks
> > the function by looking at get_insns(), that might be more closely what
> > "as laid out" is.
> I prefer to use get_insns ().
> >
> > >        else if (TARGET_PAD_RETURNS)
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > new file mode 100644
> > > index 00000000000..33d9684f0ad
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > > +
> > > +struct X { double x[2]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X* x, struct X* y)
> > > +{
> > > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > new file mode 100644
> > > index 00000000000..45060b73c06
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > > +
> > > +struct X { double x[4]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X x, struct X y)
> > > +{
> > > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_decl to elemental loads to avoid STLF stalls.
  2022-04-01  6:46     ` liuhongt
@ 2022-04-01  6:53       ` Richard Biener
  2022-04-01  7:14         ` Hongtao Liu
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Biener @ 2022-04-01  6:53 UTC (permalink / raw)
  To: liuhongt, Jan Hubicka; +Cc: GCC Patches

On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Update in V2:
> 1. Use get_insns instead of FOR_EACH_BB_FN and FOR_BB_INSNS.
> 2. Return for any_uncondjump_p and ANY_RETURN_P.
> 3. Add dump info for splitting instruction.
> 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
>
> Since cfg is freed before machine_reorg, just do a rough calculation
> of the window according to the layout.
> Also according to an experiment on CLX, set window size to 64.
>
> Currently only handle V2DFmode load since it doesn't need any scratch
> registers, and it's sufficient to recover cray performance for -O2
> compared to GCC11.
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
>         function
>         (ix86_reorg): Call ix86_split_stlf_stall_load.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
>  3 files changed, 84 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 5a561966eb4..c88a689f32b 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
>        emit_insn_after (gen_nops (const1_rtx), insn);
>      }
>  }
> +/* Split vector load from parm_decl to elemental loads to avoid STLF
> +   stalls.  */
> +static void
> +ix86_split_stlf_stall_load ()
> +{
> +  rtx_insn* insn, *start = get_insns ();
> +  unsigned window = 0;
> +
> +  for (insn = start; insn; insn = NEXT_INSN (insn))
> +    {
> +      if (!NONDEBUG_INSN_P (insn))
> +       continue;
> +      window++;
> +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> +        other, just emulate for pipeline) before stalled load, stlf stall
> +        case is as fast as no stall cases on CLX.
> +        Since CFG is freed before machine_reorg, just do a rough
> +        calculation of the window according to the layout.  */
> +      if (window > 64)

I think we want to turn the '64' into a --param at least.  You can add

-param=x86-stlf-window-ninsns=

into i386.opt (see -param= examples in aarch64/ for example).

> +       return;
> +
> +      if (any_uncondjump_p (insn)
> +         || ANY_RETURN_P (PATTERN (insn)))

You made a point about calls - does any_uncondjump_p cover them?

otherwise I think this is fine, Honza, do you agree?

Thanks,
Richard.

> +       return;
> +
> +      rtx set = single_set (insn);
> +      if (!set)
> +       continue;
> +      rtx src = SET_SRC (set);
> +      if (!MEM_P (src)
> +         /* Only handle V2DFmode load since it doesn't need any scratch
> +            register.  */
> +         || GET_MODE (src) != E_V2DFmode
> +         || !MEM_EXPR (src)
> +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL
> +       continue;
> +
> +      rtx zero = CONST0_RTX (V2DFmode);
> +      rtx dest = SET_DEST (set);
> +      rtx m = adjust_address (src, DFmode, 0);
> +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> +      emit_insn_before (loadlpd, insn);
> +      m = adjust_address (src, DFmode, 8);
> +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fputs ("Due to potential STLF stall, split instruction:\n",
> +                dump_file);
> +         print_rtl_single (dump_file, insn);
> +         fputs ("To:\n", dump_file);
> +         print_rtl_single (dump_file, loadlpd);
> +         print_rtl_single (dump_file, loadhpd);
> +       }
> +      PATTERN (insn) = loadhpd;
> +      INSN_CODE (insn) = -1;
> +      gcc_assert (recog_memoized (insn) != -1);
> +    }
> +}
>
>  /* Implement machine specific optimizations.  We implement padding of returns
>     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> @@ -21948,6 +22006,8 @@ ix86_reorg (void)
>
>    if (optimize && optimize_function_for_speed_p (cfun))
>      {
> +      if (TARGET_SSE2)
> +       ix86_split_stlf_stall_load ();
>        if (TARGET_PAD_SHORT_FUNCTION)
>         ix86_pad_short_function ();
>        else if (TARGET_PAD_RETURNS)
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..33d9684f0ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..45060b73c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_del to elemental loads to avoid STLF stalls.
  2022-04-01  6:53       ` Richard Biener
@ 2022-04-01  7:14         ` Hongtao Liu
  2022-04-01  7:20           ` Richard Biener
  0 siblings, 1 reply; 10+ messages in thread
From: Hongtao Liu @ 2022-04-01  7:14 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, Jan Hubicka, GCC Patches

On Fri, Apr 1, 2022 at 2:54 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Update in V2:
> > > 1. Use get_insns instead of FOR_EACH_BB_FN and FOR_BB_INSNS.
> > > 2. Return for any_uncondjump_p and ANY_RETURN_P.
> > > 3. Add dump info for splitting instruction.
> > 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
> >
> > Since cfg is freed before machine_reorg, just do a rough calculation
> > of the window according to the layout.
> > Also according to an experiment on CLX, set window size to 64.
> >
> > Currently only handle V2DFmode load since it doesn't need any scratch
> > registers, and it's sufficient to recover cray performance for -O2
> > compared to GCC11.
> >
> > gcc/ChangeLog:
> >
> >         PR target/101908
> >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> >         function
> >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr101908-1.c: New test.
> >         * gcc.target/i386/pr101908-2.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
> >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
> >  3 files changed, 84 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 5a561966eb4..c88a689f32b 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
> >        emit_insn_after (gen_nops (const1_rtx), insn);
> >      }
> >  }
> > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > +   stalls.  */
> > +static void
> > +ix86_split_stlf_stall_load ()
> > +{
> > +  rtx_insn* insn, *start = get_insns ();
> > +  unsigned window = 0;
> > +
> > +  for (insn = start; insn; insn = NEXT_INSN (insn))
> > +    {
> > +      if (!NONDEBUG_INSN_P (insn))
> > +       continue;
> > +      window++;
> > +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > +        other, just emulate for pipeline) before stalled load, stlf stall
> > +        case is as fast as no stall cases on CLX.
> > +        Since CFG is freed before machine_reorg, just do a rough
> > +        calculation of the window according to the layout.  */
> > +      if (window > 64)
>
> I think we want to turn the '64' into a --param at least.  You can add
>
> -param=x86-stlf-window-ninsns=
>
> into i386.opt (see -param= examples in aarch64/ for example).
Sure.
>
> > +       return;
> > +
> > +      if (any_uncondjump_p (insn)
> > +         || ANY_RETURN_P (PATTERN (insn)))
>
> You made a point about calls - does any_uncondjump_p cover them?
>
No, I prefer excluding calls which could take sufficient time to
compensate for the STLF stall.
> otherwise I think this is fine, Honza, do you agree?
>
> Thanks,
> Richard.
>
> > +       return;
> > +
> > +      rtx set = single_set (insn);
> > +      if (!set)
> > +       continue;
> > +      rtx src = SET_SRC (set);
> > +      if (!MEM_P (src)
> > +         /* Only handle V2DFmode load since it doesn't need any scratch
> > +            register.  */
> > +         || GET_MODE (src) != E_V2DFmode
> > +         || !MEM_EXPR (src)
> > +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL
> > +       continue;
> > +
> > +      rtx zero = CONST0_RTX (V2DFmode);
> > +      rtx dest = SET_DEST (set);
> > +      rtx m = adjust_address (src, DFmode, 0);
> > +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> > +      emit_insn_before (loadlpd, insn);
> > +      m = adjust_address (src, DFmode, 8);
> > +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fputs ("Due to potential STLF stall, split instruction:\n",
> > +                dump_file);
> > +         print_rtl_single (dump_file, insn);
> > +         fputs ("To:\n", dump_file);
> > +         print_rtl_single (dump_file, loadlpd);
> > +         print_rtl_single (dump_file, loadhpd);
> > +       }
> > +      PATTERN (insn) = loadhpd;
> > +      INSN_CODE (insn) = -1;
> > +      gcc_assert (recog_memoized (insn) != -1);
> > +    }
> > +}
> >
> >  /* Implement machine specific optimizations.  We implement padding of returns
> >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> > @@ -21948,6 +22006,8 @@ ix86_reorg (void)
> >
> >    if (optimize && optimize_function_for_speed_p (cfun))
> >      {
> > +      if (TARGET_SSE2)
> > +       ix86_split_stlf_stall_load ();
> >        if (TARGET_PAD_SHORT_FUNCTION)
> >         ix86_pad_short_function ();
> >        else if (TARGET_PAD_RETURNS)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > new file mode 100644
> > index 00000000000..33d9684f0ad
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X* x, struct X* y)
> > +{
> > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > new file mode 100644
> > index 00000000000..45060b73c06
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > +
> > +struct X { double x[4]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X x, struct X y)
> > +{
> > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > +}
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Split vector load from parm_del to elemental loads to avoid STLF stalls.
  2022-04-01  7:14         ` Hongtao Liu
@ 2022-04-01  7:20           ` Richard Biener
  2022-04-01  7:51             ` [PATCH V3] " liuhongt
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Biener @ 2022-04-01  7:20 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, Jan Hubicka, GCC Patches

On Fri, Apr 1, 2022 at 9:14 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Apr 1, 2022 at 2:54 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Update in V2:
> > > > 1. Use get_insns instead of FOR_EACH_BB_FN and FOR_BB_INSNS.
> > > > 2. Return for any_uncondjump_p and ANY_RETURN_P.
> > > > 3. Add dump info for splitting instruction.
> > > 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
> > >
> > > Since cfg is freed before machine_reorg, just do a rough calculation
> > > of the window according to the layout.
> > > Also according to an experiment on CLX, set window size to 64.
> > >
> > > Currently only handle V2DFmode load since it doesn't need any scratch
> > > registers, and it's sufficient to recover cray performance for -O2
> > > compared to GCC11.
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/101908
> > >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> > >         function
> > >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr101908-1.c: New test.
> > >         * gcc.target/i386/pr101908-2.c: New test.
> > > ---
> > >  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
> > >  3 files changed, 84 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index 5a561966eb4..c88a689f32b 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
> > >        emit_insn_after (gen_nops (const1_rtx), insn);
> > >      }
> > >  }
> > > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > > +   stalls.  */
> > > +static void
> > > +ix86_split_stlf_stall_load ()
> > > +{
> > > +  rtx_insn* insn, *start = get_insns ();
> > > +  unsigned window = 0;
> > > +
> > > +  for (insn = start; insn; insn = NEXT_INSN (insn))
> > > +    {
> > > +      if (!NONDEBUG_INSN_P (insn))
> > > +       continue;
> > > +      window++;
> > > +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > > +        other, just emulate for pipeline) before stalled load, stlf stall
> > > +        case is as fast as no stall cases on CLX.
> > > +        Since CFG is freed before machine_reorg, just do a rough
> > > +        calculation of the window according to the layout.  */
> > > +      if (window > 64)
> >
> > I think we want to turn the '64' into a --param at least.  You can add
> >
> > -param=x86-stlf-window-ninsns=
> >
> > into i386.opt (see -param= examples in aarch64/ for example).
> Sure.
> >
> > > +       return;
> > > +
> > > +      if (any_uncondjump_p (insn)
> > > +         || ANY_RETURN_P (PATTERN (insn)))
> >
> > You made a point about calls - does any_uncondjump_p cover them?
> >
> No, I prefer excluding calls which could take sufficient time to
> compensate for the STLF stall.

So I guess CALL_P (insn) could check for them, I agree we can stop looking
at calls.

> > otherwise I think this is fine, Honza, do you agree?
> >
> > Thanks,
> > Richard.
> >
> > > +       return;
> > > +
> > > +      rtx set = single_set (insn);
> > > +      if (!set)
> > > +       continue;
> > > +      rtx src = SET_SRC (set);
> > > +      if (!MEM_P (src)
> > > +         /* Only handle V2DFmode load since it doesn't need any scratch
> > > +            register.  */
> > > +         || GET_MODE (src) != E_V2DFmode
> > > +         || !MEM_EXPR (src)
> > > +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL
> > > +       continue;
> > > +
> > > +      rtx zero = CONST0_RTX (V2DFmode);
> > > +      rtx dest = SET_DEST (set);
> > > +      rtx m = adjust_address (src, DFmode, 0);
> > > +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> > > +      emit_insn_before (loadlpd, insn);
> > > +      m = adjust_address (src, DFmode, 8);
> > > +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> > > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > > +       {
> > > +         fputs ("Due to potential STLF stall, split instruction:\n",
> > > +                dump_file);
> > > +         print_rtl_single (dump_file, insn);
> > > +         fputs ("To:\n", dump_file);
> > > +         print_rtl_single (dump_file, loadlpd);
> > > +         print_rtl_single (dump_file, loadhpd);
> > > +       }
> > > +      PATTERN (insn) = loadhpd;
> > > +      INSN_CODE (insn) = -1;
> > > +      gcc_assert (recog_memoized (insn) != -1);
> > > +    }
> > > +}
> > >
> > >  /* Implement machine specific optimizations.  We implement padding of returns
> > >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> > > @@ -21948,6 +22006,8 @@ ix86_reorg (void)
> > >
> > >    if (optimize && optimize_function_for_speed_p (cfun))
> > >      {
> > > +      if (TARGET_SSE2)
> > > +       ix86_split_stlf_stall_load ();
> > >        if (TARGET_PAD_SHORT_FUNCTION)
> > >         ix86_pad_short_function ();
> > >        else if (TARGET_PAD_RETURNS)
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > new file mode 100644
> > > index 00000000000..33d9684f0ad
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > > +
> > > +struct X { double x[2]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X* x, struct X* y)
> > > +{
> > > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > new file mode 100644
> > > index 00000000000..45060b73c06
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > > +
> > > +struct X { double x[4]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X x, struct X y)
> > > +{
> > > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH V3] Split vector load from parm_del to elemental loads to avoid STLF stalls.
  2022-04-01  7:20           ` Richard Biener
@ 2022-04-01  7:51             ` liuhongt
  2022-04-04 11:47               ` Hongtao Liu
  0 siblings, 1 reply; 10+ messages in thread
From: liuhongt @ 2022-04-01  7:51 UTC (permalink / raw)
  To: gcc-patches

Update in V3:
1. Add -param=x86-stlf-window-ninsns= (default 64).
2. Stop scanning at the first call insn, i.e. a call ends the window.

Since the CFG is freed before machine_reorg, the window is only roughly
estimated by walking the insns in layout order.
Based on an experiment on CLX, the window size is set to 64 instructions.

Currently only V2DFmode loads are handled, since splitting them doesn't
need any scratch registers, and that is sufficient to recover cray
performance at -O2 compared to GCC 11.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_split_stlf_stall_load): New
	function.
	(ix86_reorg): Call ix86_split_stlf_stall_load.
	* config/i386/i386.opt (-param=x86-stlf-window-ninsns=): New
	param.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
	* gcc.target/i386/pr101908-3.c: New test.
---
 gcc/config/i386/i386.cc                    | 61 ++++++++++++++++++++++
 gcc/config/i386/i386.opt                   |  4 ++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
 gcc/testsuite/gcc.target/i386/pr101908-3.c | 14 +++++
 5 files changed, 103 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5a561966eb4..3f8a2c7932d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21933,6 +21933,65 @@ ix86_seh_fixup_eh_fallthru (void)
       emit_insn_after (gen_nops (const1_rtx), insn);
     }
 }
+/* Split vector load from parm_decl to elemental loads to avoid STLF
+   stalls.  */
+static void
+ix86_split_stlf_stall_load ()
+{
+  rtx_insn* insn, *start = get_insns ();
+  unsigned window = 0;
+
+  for (insn = start; insn; insn = NEXT_INSN (insn))
+    {
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      window++;
+      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
+	 other, just emulate for pipeline) before stalled load, stlf stall
+	 case is as fast as no stall cases on CLX.
+	 Since CFG is freed before machine_reorg, just do a rough
+	 calculation of the window according to the layout.  */
+      if (window > (unsigned) x86_stlf_window_ninsns)
+	return;
+
+      if (any_uncondjump_p (insn)
+	  || ANY_RETURN_P (PATTERN (insn))
+	  || CALL_P (insn))
+	return;
+
+      rtx set = single_set (insn);
+      if (!set)
+	continue;
+      rtx src = SET_SRC (set);
+      if (!MEM_P (src)
+	  /* Only handle V2DFmode load since it doesn't need any scratch
+	     register.  */
+	  || GET_MODE (src) != E_V2DFmode
+	  || !MEM_EXPR (src)
+	  || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+	continue;
+
+      rtx zero = CONST0_RTX (V2DFmode);
+      rtx dest = SET_DEST (set);
+      rtx m = adjust_address (src, DFmode, 0);
+      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
+      emit_insn_before (loadlpd, insn);
+      m = adjust_address (src, DFmode, 8);
+      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fputs ("Due to potential STLF stall, split instruction:\n",
+		 dump_file);
+	  print_rtl_single (dump_file, insn);
+	  fputs ("To:\n", dump_file);
+	  print_rtl_single (dump_file, loadlpd);
+	  print_rtl_single (dump_file, loadhpd);
+	}
+      PATTERN (insn) = loadhpd;
+      INSN_CODE (insn) = -1;
+      gcc_assert (recog_memoized (insn) != -1);
+    }
+}
 
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
@@ -21948,6 +22007,8 @@ ix86_reorg (void)
 
   if (optimize && optimize_function_for_speed_p (cfun))
     {
+      if (TARGET_SSE2)
+	ix86_split_stlf_stall_load ();
       if (TARGET_PAD_SHORT_FUNCTION)
 	ix86_pad_short_function ();
       else if (TARGET_PAD_RETURNS)
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index d8e8656a8ab..a6b0e28f238 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1210,3 +1210,7 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5
 mdirect-extern-access
 Target Var(ix86_direct_extern_access) Init(1)
 Do not use GOT to access external symbols.
+
+-param=x86-stlf-window-ninsns=
+Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
+Number of instructions above which the STLF stall penalty can be compensated.
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..33d9684f0ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..45060b73c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
new file mode 100644
index 00000000000..ddd3e8eff33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]+} } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+extern void bar (void);
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  bar ();
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH V3] Split vector load from parm_del to elemental loads to avoid STLF stalls.
  2022-04-01  7:51             ` [PATCH V3] " liuhongt
@ 2022-04-04 11:47               ` Hongtao Liu
  0 siblings, 0 replies; 10+ messages in thread
From: Hongtao Liu @ 2022-04-04 11:47 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Fri, Apr 1, 2022 at 4:32 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Update in V3:
> 1. Add -param=x86-stlf-window-ninsns= (default 64).
> 2. End the window at the first call instruction.
>
> Since the CFG is freed before machine_reorg, the window is only
> estimated roughly, by walking the insns in layout order.
> Based on an experiment on CLX, the window size is set to 64.
>
> Currently only V2DFmode loads are handled, since splitting them doesn't
> need any scratch registers, and that is sufficient to recover cray
> performance at -O2 compared to GCC 11.

I'm going to check in the patch.
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
>         function
>         (ix86_reorg): Call ix86_split_stlf_stall_load.
>         * config/i386/i386.opt (-param=x86-stlf-window-ninsns=): New
>         param.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
>         * gcc.target/i386/pr101908-3.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 61 ++++++++++++++++++++++
>  gcc/config/i386/i386.opt                   |  4 ++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
>  gcc/testsuite/gcc.target/i386/pr101908-3.c | 14 +++++
>  5 files changed, 103 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 5a561966eb4..3f8a2c7932d 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21933,6 +21933,65 @@ ix86_seh_fixup_eh_fallthru (void)
>        emit_insn_after (gen_nops (const1_rtx), insn);
>      }
>  }
> +/* Split vector load from parm_decl to elemental loads to avoid STLF
> +   stalls.  */
> +static void
> +ix86_split_stlf_stall_load ()
> +{
> +  rtx_insn* insn, *start = get_insns ();
> +  unsigned window = 0;
> +
> +  for (insn = start; insn; insn = NEXT_INSN (insn))
> +    {
> +      if (!NONDEBUG_INSN_P (insn))
> +       continue;
> +      window++;
> +      /* Experiments on CLX show that once 64 independent instructions
> +        (e.g. vaddps %xmm18, %xmm19, %xmm20) precede the stalled load,
> +        the STLF-stall case is as fast as the no-stall case.
> +        Since CFG is freed before machine_reorg, just do a rough
> +        calculation of the window according to the layout.  */
> +      if (window > (unsigned) x86_stlf_window_ninsns)
> +       return;
> +
> +      if (any_uncondjump_p (insn)
> +         || ANY_RETURN_P (PATTERN (insn))
> +         || CALL_P (insn))
> +       return;
> +
> +      rtx set = single_set (insn);
> +      if (!set)
> +       continue;
> +      rtx src = SET_SRC (set);
> +      if (!MEM_P (src)
> +         /* Only handle V2DFmode load since it doesn't need any scratch
> +            register.  */
> +         || GET_MODE (src) != E_V2DFmode
> +         || !MEM_EXPR (src)
> +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
> +       continue;
> +
> +      rtx zero = CONST0_RTX (V2DFmode);
> +      rtx dest = SET_DEST (set);
> +      rtx m = adjust_address (src, DFmode, 0);
> +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> +      emit_insn_before (loadlpd, insn);
> +      m = adjust_address (src, DFmode, 8);
> +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fputs ("Due to potential STLF stall, split instruction:\n",
> +                dump_file);
> +         print_rtl_single (dump_file, insn);
> +         fputs ("To:\n", dump_file);
> +         print_rtl_single (dump_file, loadlpd);
> +         print_rtl_single (dump_file, loadhpd);
> +       }
> +      PATTERN (insn) = loadhpd;
> +      INSN_CODE (insn) = -1;
> +      gcc_assert (recog_memoized (insn) != -1);
> +    }
> +}
>
>  /* Implement machine specific optimizations.  We implement padding of returns
>     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> @@ -21948,6 +22007,8 @@ ix86_reorg (void)
>
>    if (optimize && optimize_function_for_speed_p (cfun))
>      {
> +      if (TARGET_SSE2)
> +       ix86_split_stlf_stall_load ();
>        if (TARGET_PAD_SHORT_FUNCTION)
>         ix86_pad_short_function ();
>        else if (TARGET_PAD_RETURNS)
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index d8e8656a8ab..a6b0e28f238 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1210,3 +1210,7 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5
>  mdirect-extern-access
>  Target Var(ix86_direct_extern_access) Init(1)
>  Do not use GOT to access external symbols.
> +
> +-param=x86-stlf-window-ninsns=
> +Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
> +Instructions number above which STLF stall penalty can be compensated.
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..33d9684f0ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..45060b73c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> new file mode 100644
> index 00000000000..ddd3e8eff33
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]+} } }  */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +extern void bar (void);
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  bar ();
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-04-04 11:47 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-31  5:51 [PATCH] Split vector load from parm_del to elemental loads to avoid STLF stalls liuhongt
2022-03-31 10:44 ` Richard Biener
2022-04-01  6:29   ` Hongtao Liu
2022-04-01  6:46     ` liuhongt
2022-04-01  6:53       ` Richard Biener
2022-04-01  7:14         ` Hongtao Liu
2022-04-01  7:20           ` Richard Biener
2022-04-01  7:51             ` [PATCH V3] " liuhongt
2022-04-04 11:47               ` Hongtao Liu
2022-04-01  6:47     ` [PATCH] " Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).