public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
@ 2022-03-04  7:27 liuhongt
  2022-03-07  8:26 ` Hongtao Liu
  2022-03-07  9:37 ` Richard Biener
  0 siblings, 2 replies; 9+ messages in thread
From: liuhongt @ 2022-03-04  7:27 UTC (permalink / raw)
  To: gcc-patches

For parameters passed on the stack, a vectorized load from a parm_decl
in the callee may trigger a serious store-forwarding (STF) stall. This
is why GCC12 regresses 50% for cray at -O2 compared to GCC11.

The patch adds an extremely large number to stmt_cost to prevent
vectorization of loads from a parm_decl under the very-cheap cost model.
This at least prevents the O2 regression due to the STF issue, but may
lose some perf where there's no such issue (1 vector_load vs n
scalar_load + CTOR).

No impact for SPEC2017 for both plain O2 and native O2 on ICX.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
	(ix86_vector_costs::add_stmt_cost): Add extra cost for
	vector_load/unaligned_load which may have store forwarding
	stall issue.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 31 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++++++
 3 files changed, 55 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b2bf90576d5..3bbaaf65ea8 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22976,6 +22976,19 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
+/* Return true if REF may have STF issue, otherwise false.  */
+static bool
+ix86_load_maybe_stfs_p (tree ref)
+{
+  tree addr = get_base_address (ref);
+
+  if (TREE_CODE (addr) != PARM_DECL
+      || !tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (addr)))
+      || tree_to_uhwi (TYPE_SIZE (TREE_TYPE (addr))) <= MAX_BITS_PER_WORD)
+    return false;
+  return true;
+}
+
 /* x86-specific vector costs.  */
 class ix86_vector_costs : public vector_costs
 {
@@ -23203,6 +23216,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	if (TREE_CODE (op) == SSA_NAME)
 	  TREE_VISITED (op) = 0;
     }
+
+  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
+     Performance may lose when there's no STF issue(1 vector_load vs n
+     scalar_load + CTOR).
+     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine
+     tuned.  */
+  if ((kind == vector_load || kind == unaligned_load)
+      && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && stmt_info
+      && stmt_info->slp_type == pure_slp
+      && stmt_info->stmt
+      && gimple_assign_load_p (stmt_info->stmt)
+      && ix86_load_maybe_stfs_p (gimple_assign_rhs1 (stmt_info->stmt)))
+    {
+      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+      stmt_cost += 2000;
+    }
+
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..f8e0f2e26bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..7f2f00cebab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
  2022-03-04  7:27 [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue liuhongt
@ 2022-03-07  8:26 ` Hongtao Liu
  2022-03-07  9:37 ` Richard Biener
  1 sibling, 0 replies; 9+ messages in thread
From: Hongtao Liu @ 2022-03-07  8:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Fri, Mar 4, 2022 at 3:28 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For parameter passing through stack, vectorized load from parm_decl
> in callee may trigger serious STF issue. This is why GCC12 regresses
> 50% for cray at -O2 compared to GCC11.
>
> The patch add an extremely large number to stmt_cost to prevent
> vectorization for loads from parm_decl under very-cheap cost model,
> this can at least prevent O2 regression due to STF issue, but may lose
> some perf where there's no such issue(1 vector_load vs n scalar_load +
> CTOR).
>
> No impact for SPEC2017 for both plain O2 and native O2 on ICX.
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
>         (ix86_vector_costs::add_stmt_cost): Add extra cost for
>         vector_load/unsigned_load which may have stall forward issue.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 31 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++++++
>  3 files changed, 55 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index b2bf90576d5..3bbaaf65ea8 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22976,6 +22976,19 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
>    return default_noce_conversion_profitable_p (seq, if_info);
>  }
>
> +/* Return true if REF may have STF issue, otherwise false.  */
> +static bool
> +ix86_load_maybe_stfs_p (tree ref)
> +{
> +  tree addr = get_base_address (ref);
> +
> +  if (TREE_CODE (addr) != PARM_DECL
> +      || !tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (addr)))
> +      || tree_to_uhwi (TYPE_SIZE (TREE_TYPE (addr))) <= MAX_BITS_PER_WORD)
> +    return false;
> +  return true;
> +}
> +
>  /* x86-specific vector costs.  */
>  class ix86_vector_costs : public vector_costs
>  {
> @@ -23203,6 +23216,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>         if (TREE_CODE (op) == SSA_NAME)
>           TREE_VISITED (op) = 0;
>      }
> +
> +  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
> +     Performance may lose when there's no STF issue(1 vector_load vs n
> +     scalar_load + CTOR).
> +     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine
> +     tuned.  */
> +  if ((kind == vector_load || kind == unaligned_load)
> +      && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
> +      && stmt_info
> +      && stmt_info->slp_type == pure_slp
> +      && stmt_info->stmt
> +      && gimple_assign_load_p (stmt_info->stmt)
> +      && ix86_load_maybe_stfs_p (gimple_assign_rhs1 (stmt_info->stmt)))
> +    {
> +      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> +      stmt_cost += 2000;
> +    }
> +
>    if (stmt_cost == -1)
>      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..f8e0f2e26bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..7f2f00cebab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
  2022-03-04  7:27 [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue liuhongt
  2022-03-07  8:26 ` Hongtao Liu
@ 2022-03-07  9:37 ` Richard Biener
  2022-03-08  3:39   ` Hongtao Liu
  1 sibling, 1 reply; 9+ messages in thread
From: Richard Biener @ 2022-03-07  9:37 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Fri, Mar 4, 2022 at 8:27 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For parameter passing through stack, vectorized load from parm_decl
> in callee may trigger serious STF issue. This is why GCC12 regresses
> 50% for cray at -O2 compared to GCC11.
>
> The patch add an extremely large number to stmt_cost to prevent
> vectorization for loads from parm_decl under very-cheap cost model,
> this can at least prevent O2 regression due to STF issue, but may lose
> some perf where there's no such issue(1 vector_load vs n scalar_load +
> CTOR).

Note this is just a heuristic in that by-value passed parameters are usually
stored to the stack shortly before the function call.  It does not catch the
similar case from

  foo (const X &bar)  { ... }

where a

  foo ({ 1., 2. })

will have the object passed by reference constructed right before the
call.  In the end a full solution will need to perform some IPA analysis
that computes the initialization distance from the call and uses should
factor in the use distance from function entry.

> No impact for SPEC2017 for both plain O2 and native O2 on ICX.
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
>         (ix86_vector_costs::add_stmt_cost): Add extra cost for
>         vector_load/unsigned_load which may have stall forward issue.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 31 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++++++
>  3 files changed, 55 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index b2bf90576d5..3bbaaf65ea8 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22976,6 +22976,19 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
>    return default_noce_conversion_profitable_p (seq, if_info);
>  }
>
> +/* Return true if REF may have STF issue, otherwise false.  */
> +static bool
> +ix86_load_maybe_stfs_p (tree ref)
> +{
> +  tree addr = get_base_address (ref);
> +
> +  if (TREE_CODE (addr) != PARM_DECL
> +      || !tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (addr)))
> +      || tree_to_uhwi (TYPE_SIZE (TREE_TYPE (addr))) <= MAX_BITS_PER_WORD)
> +    return false;
> +  return true;
> +}
> +
>  /* x86-specific vector costs.  */
>  class ix86_vector_costs : public vector_costs
>  {
> @@ -23203,6 +23216,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>         if (TREE_CODE (op) == SSA_NAME)
>           TREE_VISITED (op) = 0;
>      }
> +
> +  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
> +     Performance may lose when there's no STF issue(1 vector_load vs n
> +     scalar_load + CTOR).
> +     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine
> +     tuned.  */
> +  if ((kind == vector_load || kind == unaligned_load)
> +      && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP

This check doesn't make much sense, I'd rather remove it.

> +      && stmt_info
> +      && stmt_info->slp_type == pure_slp
> +      && stmt_info->stmt
> +      && gimple_assign_load_p (stmt_info->stmt)
> +      && ix86_load_maybe_stfs_p (gimple_assign_rhs1 (stmt_info->stmt)))

I'd pass down STMT_VINFO_DATA_REF instead and have ix86_load_maybe_stfs_p
use

  tree addr = DR_BASE_ADDRESS (dr);
  if (TREE_CODE (addr) != ADDR_EXPR)
    return false;
  addr = get_base_address (TREE_OPERAND (addr, 0));
  ...

since that gets you a more reliable way to look at the actual object referenced.

> +    {
> +      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> +      stmt_cost += 2000;
> +    }
> +

Maybe handle this like the Bonnell case and thus put it after the
stmt_cost == -1 handling, just bumping the cost (also noting the actual number
is arbitrary).  It would be nice to have a better estimate on the penalty than
"2000", maybe formulate it in terms of the target costs simple-sse op at least.

That said, it might be interesting to micro-benchmark

v2df __attribute__((noipa))
foo (struct X* x, struct X* y)
{
  double temx0, temx1, temy0, temy1;
  temx0 = x->x[0];
  temx0 += temx0;
 ...
  temx1 = x->x[1];
  temx1 += temx1;
...
  return (v2df) {temx1, temx0 } + (v2df) { temy1, temy0 };
}

(without -ffast-math) to see how many vector adds we'd need to compensate the
STLF penalty (just to have an idea whether the magic number is closer to 200,
2000 or 20000).  Maybe also put that respective kernel into the i386 testsuite
with a specific -mtune (and make the thing a target tunable?).

>    if (stmt_cost == -1)
>      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..f8e0f2e26bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..7f2f00cebab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
  2022-03-07  9:37 ` Richard Biener
@ 2022-03-08  3:39   ` Hongtao Liu
  2022-03-15  9:13     ` [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue liuhongt
                       ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Hongtao Liu @ 2022-03-08  3:39 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, GCC Patches

On Mon, Mar 7, 2022 at 5:37 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Mar 4, 2022 at 8:27 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > For parameter passing through stack, vectorized load from parm_decl
> > in callee may trigger serious STF issue. This is why GCC12 regresses
> > 50% for cray at -O2 compared to GCC11.
> >
> > The patch add an extremely large number to stmt_cost to prevent
> > vectorization for loads from parm_decl under very-cheap cost model,
> > this can at least prevent O2 regression due to STF issue, but may lose
> > some perf where there's no such issue(1 vector_load vs n scalar_load +
> > CTOR).
>
> Note this is just heuristics in that by-value passed parameters are usually
> stored to the stack close before the function call.  It does not catch the
> similar case from
>
>   foo (const X &bar)  { ... }
>
> where a
>
>   foo ({ 1., 2. })
>
> will have the object passed by reference constructed right before the
> call.  In the end a full solution will need to perform some IPA analysis
> that computes the initialization distance from the call and uses should
> factor in the use distance from function entry.
Yes, this patch only deals with by-value passed objects, which have
nothing to do with IPA and depend only on the psABI.
For the case of passing references, IPA is necessary to help analyze
some obvious STFS scenarios, but static analysis still has
limitations: for example, when the store crosses a cache line (or
page) boundary, IPA cannot give a judgment.
>
> > No impact for SPEC2017 for both plain O2 and native O2 on ICX.
> > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR target/101908
> >         * config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
> >         (ix86_vector_costs::add_stmt_cost): Add extra cost for
> >         vector_load/unsigned_load which may have stall forward issue.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr101908-1.c: New test.
> >         * gcc.target/i386/pr101908-2.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                    | 31 ++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++++++
> >  3 files changed, 55 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index b2bf90576d5..3bbaaf65ea8 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -22976,6 +22976,19 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
> >    return default_noce_conversion_profitable_p (seq, if_info);
> >  }
> >
> > +/* Return true if REF may have STF issue, otherwise false.  */
> > +static bool
> > +ix86_load_maybe_stfs_p (tree ref)
> > +{
> > +  tree addr = get_base_address (ref);
> > +
> > +  if (TREE_CODE (addr) != PARM_DECL
> > +      || !tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (addr)))
> > +      || tree_to_uhwi (TYPE_SIZE (TREE_TYPE (addr))) <= MAX_BITS_PER_WORD)
> > +    return false;
> > +  return true;
> > +}
> > +
> >  /* x86-specific vector costs.  */
> >  class ix86_vector_costs : public vector_costs
> >  {
> > @@ -23203,6 +23216,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >         if (TREE_CODE (op) == SSA_NAME)
> >           TREE_VISITED (op) = 0;
> >      }
> > +
> > +  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
> > +     Performance may lose when there's no STF issue(1 vector_load vs n
> > +     scalar_load + CTOR).
> > +     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine
> > +     tuned.  */
> > +  if ((kind == vector_load || kind == unaligned_load)
> > +      && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
>
> This check doesn't make much sense, I'd rather remove it.
Will change.
>
> > +      && stmt_info
> > +      && stmt_info->slp_type == pure_slp
> > +      && stmt_info->stmt
> > +      && gimple_assign_load_p (stmt_info->stmt)
> > +      && ix86_load_maybe_stfs_p (gimple_assign_rhs1 (stmt_info->stmt)))
>
> I'd pass down STMT_VINFO_DATA_REF instead and have ix86_load_maybe_stfs_p
> and use
>
>   tree addr = DR_BASE_ADDRESS (dr);
>   if (TREE_CODE (addr) != ADDR_EXPR)
>     return false;
>   addr = get_base_address (TREE_OPERAND (addr, 0));
>   ...
>
> since that gets you a more reliable way to look at the actual object referenced.
Yes.
>
> > +    {
> > +      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> > +      stmt_cost += 2000;
> > +    }
> > +
>
> Maybe handle this like the Bonell case and thus put it after the
> stmt_cost == -1 handling, just bumping the cost (also noting the actual number
> is arbitrary).  It would be nice to have a better estimate on the penalty than
> "2000", maybe formulate it in terms of the target costs simple-sse op at least.
>
I'll add a member to ix86_cost for STFS to make it target specific;
the initial value will come from [1] and real experiments.

[1] https://www.agner.org/optimize/microarchitecture.pdf

> That said, it might be interesting to micro-benchmark
>
> v2df __attribute__((noipa))
> foo (struct X* x, struct X* y)
> {
>   double temx0, temx1, temy0, temy1;
>   temx0 = x->x[0];
>   temx0 += temx0;
>  ...
>   temx1 = x->x[1];
>   temx1 += temx1;
> ...
>   return (v2df) {temx1, temx0 } + (v2df) { temy1, temy0 };
> }
>
> (without -ffast-math) to see how many vector adds we'd need to compensate the
> STLF penalty (just to have an idea whether the magic number is closer to 200,
> 2000 or 20000).  Maybe also put that respective kernel into the i386 testsuite
> with a specific -mtune (and make the thing a target tunable?).
>
> >    if (stmt_cost == -1)
> >      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > new file mode 100644
> > index 00000000000..f8e0f2e26bb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X* x, struct X* y)
> > +{
> > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > new file mode 100644
> > index 00000000000..7f2f00cebab
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X x, struct X y)
> > +{
> > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > +}
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue.
  2022-03-08  3:39   ` Hongtao Liu
@ 2022-03-15  9:13     ` liuhongt
  2022-03-16  1:03     ` liuhongt
  2022-03-16  2:19     ` liuhongt
  2 siblings, 0 replies; 9+ messages in thread
From: liuhongt @ 2022-03-15  9:13 UTC (permalink / raw)
  To: gcc-patches

This patch only handles pure-slp for by-value passed parameters, which
depends only on the psABI and has nothing to do with IPA. For
by-reference passed parameters, IPA is required.

The patch is aggressive in determining STLF failure: any
unaligned_load from a parm_decl passed on the stack is assumed to have
an STLF stall issue. It could lose some perf where there's no such
issue (1 vector_load vs n scalar_load + CTOR).

According to the microbenchmark in the PR, the cost of an STLF failure
is generally between 8 and 16 scalar_loads on most recent Intel/AMD
processors.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
	(ix86_vector_costs::add_stmt_cost): Add extra cost for
	unaligned_load which may have store forwarding stall issue.
	* config/i386/i386.h (processor_costs): Add new member
	stfs.
	* config/i386/x86-tune-costs.h (i386_size_cost): Initialize
	stfs.
	(i386_cost, i486_cost, pentium_cost, lakemont_cost,
	pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost,
	amdfam10_cost, bdver_cost, znver1_cost, znver2_cost,
	znver3_cost, skylake_cost, icelake_cost, alderlake_cost,
	btver1_cost, btver2_cost, pentium4_cost, nocona_cost,
	atom_cost, slm_cost, tremont_cost, intel_cost, generic_cost,
	core_cost): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
	* gcc.target/i386/pr101908-3.c: New test.
	* gcc.target/i386/pr101908-v16hi.c: New test.
	* gcc.target/i386/pr101908-v16qi.c: New test.
	* gcc.target/i386/pr101908-v16sf.c: New test.
	* gcc.target/i386/pr101908-v16si.c: New test.
	* gcc.target/i386/pr101908-v2df.c: New test.
	* gcc.target/i386/pr101908-v2di.c: New test.
	* gcc.target/i386/pr101908-v2hi.c: New test.
	* gcc.target/i386/pr101908-v2qi.c: New test.
	* gcc.target/i386/pr101908-v2sf.c: New test.
	* gcc.target/i386/pr101908-v2si.c: New test.
	* gcc.target/i386/pr101908-v4df.c: New test.
	* gcc.target/i386/pr101908-v4di.c: New test.
	* gcc.target/i386/pr101908-v4hi.c: New test.
	* gcc.target/i386/pr101908-v4qi.c: New test.
	* gcc.target/i386/pr101908-v4sf.c: New test.
	* gcc.target/i386/pr101908-v4si.c: New test.
	* gcc.target/i386/pr101908-v8df-adl.c: New test.
	* gcc.target/i386/pr101908-v8df.c: New test.
	* gcc.target/i386/pr101908-v8di-adl.c: New test.
	* gcc.target/i386/pr101908-v8di.c: New test.
	* gcc.target/i386/pr101908-v8hi-adl.c: New test.
	* gcc.target/i386/pr101908-v8hi.c: New test.
	* gcc.target/i386/pr101908-v8qi-adl.c: New test.
	* gcc.target/i386/pr101908-v8qi.c: New test.
	* gcc.target/i386/pr101908-v8sf-adl.c: New test.
	* gcc.target/i386/pr101908-v8sf.c: New test.
	* gcc.target/i386/pr101908-v8si-adl.c: New test.
	* gcc.target/i386/pr101908-v8si.c: New test.
---
 gcc/config/i386/i386.cc                       | 51 +++++++++++
 gcc/config/i386/i386.h                        |  1 +
 gcc/config/i386/x86-tune-costs.h              | 28 ++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-2.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-3.c    | 90 +++++++++++++++++++
 .../gcc.target/i386/pr101908-v16hi.c          |  6 ++
 .../gcc.target/i386/pr101908-v16qi.c          | 30 +++++++
 .../gcc.target/i386/pr101908-v16sf.c          |  6 ++
 .../gcc.target/i386/pr101908-v16si.c          |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2si.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4si.c |  6 ++
 .../gcc.target/i386/pr101908-v8df-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8df.c |  6 ++
 .../gcc.target/i386/pr101908-v8di-adl.c       |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8di.c |  7 ++
 .../gcc.target/i386/pr101908-v8hi-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c |  6 ++
 .../gcc.target/i386/pr101908-v8qi-adl.c       | 22 +++++
 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c | 22 +++++
 .../gcc.target/i386/pr101908-v8sf-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c |  6 ++
 .../gcc.target/i386/pr101908-v8si-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8si.c |  6 ++
 34 files changed, 444 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d77ad83e437..c01809cc3da 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
+/* Return true if a load through data reference DR may suffer a store
+   forwarding (STLF) stall, otherwise false.  Any unaligned_load from a
+   parm_decl which is passed by stack is considered to have the issue.  */
+static bool
+ix86_load_maybe_stfs_p (data_reference *dr)
+{
+  /* The base address must be an ADDR_EXPR.  */
+  tree addr = DR_BASE_ADDRESS (dr);
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return false;
+
+  /* Only a load whose base object is a PARM_DECL is of interest.  */
+  addr = get_base_address (TREE_OPERAND (addr, 0));
+  if (TREE_CODE (addr) != PARM_DECL)
+    return false;
+
+  tree type = TREE_TYPE (addr);
+  if (!type)
+    return false;
+
+  machine_mode mode = TYPE_MODE (type);
+
+  /* Whether the parameter is passed by stack is only approximated here,
+     i.e. a parameter that can be put in registers may finally be passed
+     by stack because registers have run out.  */
+  if (TARGET_64BIT)
+    {
+      /* From function_arg_64.  A zero result from classify_argument is
+	 taken to mean the parameter is passed in memory.  */
+      enum x86_64_reg_class regclass[MAX_CLASSES];
+      int zero_width_bitfields = 0;
+      return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
+    }
+
+  /* From function_arg_32.  */
+  return (mode == E_BLKmode
+	  || (AGGREGATE_TYPE_P (type)
+	      && (VECTOR_MODE_P (mode) || mode == TImode)));
+}
+
 /* x86-specific vector costs.  */
 class ix86_vector_costs : public vector_costs
 {
@@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  /* Penalize pure-SLP unaligned loads from a parm_decl that may be passed
+     on the stack, to avoid store forwarding (STLF) stalls.  Performance may
+     be lost when there's no stall (1 vector_load vs n scalar_load + CTOR).
+     TODO: both the extra cost (ix86_cost->stfs / 2) and
+     ix86_load_maybe_stfs_p need to be fine tuned.  */
+  if (kind == unaligned_load && stmt_info
+      && stmt_info->slp_type == pure_slp
+      && STMT_VINFO_DATA_REF (stmt_info)
+      && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
+    stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0d28e57f8f2..341f1c47981 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -168,6 +168,7 @@ struct processor_costs {
 				   in 32bit, 64bit, 128bit, 256bit and 512bit */
   const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int stfs;		 /* cost of store forward stalls.  */
   const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
 	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 017ffa69958..3a5fcdeefdd 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
 					   in 128bit, 256bit and 512bit */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
+  6,					/* cost of store forward stall.  */
   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   5, 0,					/* Gather load static, per_elt.  */
@@ -209,6 +210,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -317,6 +319,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  21,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
+  54,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   16,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
   6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
@@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  90,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
+  10,					/* cost of store forward stall.  */
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   16, 16,				/* Gather load static, per_elt.  */
@@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  32,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
 					   in SImode, DImode and TImode.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  48,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
+  22,					/* cost of store forward stall.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
   4,					/* cost of moving SSE register to integer.  */
   6, 6,					/* Gather load static, per_elt.  */
@@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  54,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   2,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..f8e0f2e26bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..f4ff7a83c82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
new file mode 100644
index 00000000000..6f853aa7750
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
@@ -0,0 +1,90 @@
+/* PR target/101908.  */
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>.*ray \+ 24B} "slp2" } }  */
+/* This testcase is used to avoid STLF stall.  */
+
+#define sqrt __builtin_sqrt
+#define SQ(x)		((x) * (x))
+struct vec3 {
+  double x, y, z;
+};
+
+struct ray {
+  struct vec3 orig, dir;
+};
+
+struct material {
+  struct vec3 col;	/* color */
+  double spow;		/* specular power */
+  double refl;		/* reflection intensity */
+};
+
+struct sphere {
+  struct vec3 pos;
+  double rad;
+  struct material mat;
+  struct sphere *next;
+};
+
+struct spoint {
+  struct vec3 pos, normal, vref;	/* position, normal and view reflection */
+  double dist;		/* parametric distance of intersection along the ray */
+};
+
+#define ERR_MARGIN		1e-6
+
+#define DOT(a, b)	((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
+#define NORMALIZE(a)  do {			\
+    double len = sqrt(DOT(a, a));		\
+    (a).x /= len; (a).y /= len; (a).z /= len;	\
+  } while(0);
+
+static struct vec3
+reflect(struct vec3 v, struct vec3 n) {
+  struct vec3 res;
+  double dot = v.x * n.x + v.y * n.y + v.z * n.z;
+  res.x = -(2.0 * dot * n.x - v.x);
+  res.y = -(2.0 * dot * n.y - v.y);
+  res.z = -(2.0 * dot * n.z - v.z);
+  return res;
+}
+
+int ray_sphere(const struct sphere *sph,
+	       struct ray ray, struct spoint *sp) {
+  double a, b, c, d, sqrt_d, t1, t2;
+	
+  a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
+  b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
+    2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
+    2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
+  c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
+    SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
+    2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
+	
+  if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
+
+  sqrt_d = sqrt(d);
+  t1 = (-b + sqrt_d) / (2.0 * a);
+  t2 = (-b - sqrt_d) / (2.0 * a);
+
+  if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
+
+  if(sp) {
+    if(t1 < ERR_MARGIN) t1 = t2;
+    if(t2 < ERR_MARGIN) t2 = t1;
+    sp->dist = t1 < t2 ? t1 : t2;
+		
+    sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
+    sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
+    sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
+		
+    sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
+    sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
+    sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
+
+    sp->vref = reflect(ray.dir, sp->normal);
+    NORMALIZE(sp->vref);
+  }
+  return 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
new file mode 100644
index 00000000000..fcd3ee8122f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
new file mode 100644
index 00000000000..6d43788600e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3  -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[0] = x.a[1] + y.a[1];
+  p[1] = x.a[2] + y.a[2];
+  p[2] = x.a[3] + y.a[3];
+  p[3] = x.a[4] + y.a[4];
+  p[4] = x.a[5] + y.a[5];
+  p[5] = x.a[6] + y.a[6];
+  p[6] = x.a[7] + y.a[7];
+  p[7] = x.a[8] + y.a[8];
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
new file mode 100644
index 00000000000..f95b85abbc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16si.c b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
new file mode 100644
index 00000000000..5c48aa5da69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2df.c b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
new file mode 100644
index 00000000000..9d3f157718c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2di.c b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
new file mode 100644
index 00000000000..c7cf9a71f21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
new file mode 100644
index 00000000000..e6024d70780
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
new file mode 100644
index 00000000000..cf876cc70d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
new file mode 100644
index 00000000000..eb6349b957e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2si.c b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
new file mode 100644
index 00000000000..ae5fa0749c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4df.c b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
new file mode 100644
index 00000000000..94497422704
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4di.c b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
new file mode 100644
index 00000000000..71407aa9fc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
new file mode 100644
index 00000000000..4b207b91225
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
new file mode 100644
index 00000000000..5292d3442ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
new file mode 100644
index 00000000000..a2c6273120d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4si.c b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
new file mode 100644
index 00000000000..c6824285c74
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
new file mode 100644
index 00000000000..248c6d0fb91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
new file mode 100644
index 00000000000..05eb2dd51d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
new file mode 100644
index 00000000000..b0055d7d2c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
new file mode 100644
index 00000000000..76a393bcc6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
new file mode 100644
index 00000000000..28977adae28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
new file mode 100644
index 00000000000..89b50885366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
new file mode 100644
index 00000000000..be668e5d006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
new file mode 100644
index 00000000000..842c88c8952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
new file mode 100644
index 00000000000..89d33566a40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
new file mode 100644
index 00000000000..81557c7b9b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
new file mode 100644
index 00000000000..883956a0d49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
new file mode 100644
index 00000000000..142f46012d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi.c"
-- 
2.18.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue.
  2022-03-08  3:39   ` Hongtao Liu
  2022-03-15  9:13     ` [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue liuhongt
@ 2022-03-16  1:03     ` liuhongt
  2022-03-16  2:19     ` liuhongt
  2 siblings, 0 replies; 9+ messages in thread
From: liuhongt @ 2022-03-16  1:03 UTC (permalink / raw)
  To: gcc-patches

This patch only handles pure SLP for parameters passed by value, which
has nothing to do with IPA but with the psABI. For parameters passed by
reference, IPA is required.

The patch is aggressive in determining STLF failure: any
unaligned_load from a parm_decl passed on the stack is assumed to have
an STLF stall issue. It could lose some perf where there's no such
issue (1 vector_load vs n scalar_load + CTOR).

According to the microbenchmark in the PR, the cost of an STLF failure
is generally between 8 and 16 scalar loads on most recent Intel/AMD
processors.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
	(ix86_vector_costs::add_stmt_cost): Add extra cost for
	unaligned_load which may have store forwarding stall issue.
	* config/i386/i386.h (processor_costs): Add new member
	stfs.
	* config/i386/x86-tune-costs.h (i386_size_cost): Initialize
	stfs.
	(i386_cost, i486_cost, pentium_cost, lakemont_cost,
	pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost,
	amdfam10_cost, bdver_cost, znver1_cost, znver2_cost,
	znver3_cost, skylake_cost, icelake_cost, alderlake_cost,
	btver1_cost, btver2_cost, pentium4_cost, nocona_cost,
	atom_cost, slm_cost, tremont_cost, intel_cost, generic_cost,
	core_cost): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
	* gcc.target/i386/pr101908-3.c: New test.
	* gcc.target/i386/pr101908-v16hi.c: New test.
	* gcc.target/i386/pr101908-v16qi.c: New test.
	* gcc.target/i386/pr101908-v16sf.c: New test.
	* gcc.target/i386/pr101908-v16si.c: New test.
	* gcc.target/i386/pr101908-v2df.c: New test.
	* gcc.target/i386/pr101908-v2di.c: New test.
	* gcc.target/i386/pr101908-v2hi.c: New test.
	* gcc.target/i386/pr101908-v2qi.c: New test.
	* gcc.target/i386/pr101908-v2sf.c: New test.
	* gcc.target/i386/pr101908-v2si.c: New test.
	* gcc.target/i386/pr101908-v4df.c: New test.
	* gcc.target/i386/pr101908-v4di.c: New test.
	* gcc.target/i386/pr101908-v4hi.c: New test.
	* gcc.target/i386/pr101908-v4qi.c: New test.
	* gcc.target/i386/pr101908-v4sf.c: New test.
	* gcc.target/i386/pr101908-v4si.c: New test.
	* gcc.target/i386/pr101908-v8df-adl.c: New test.
	* gcc.target/i386/pr101908-v8df.c: New test.
	* gcc.target/i386/pr101908-v8di-adl.c: New test.
	* gcc.target/i386/pr101908-v8di.c: New test.
	* gcc.target/i386/pr101908-v8hi-adl.c: New test.
	* gcc.target/i386/pr101908-v8hi.c: New test.
	* gcc.target/i386/pr101908-v8qi-adl.c: New test.
	* gcc.target/i386/pr101908-v8qi.c: New test.
	* gcc.target/i386/pr101908-v8sf-adl.c: New test.
	* gcc.target/i386/pr101908-v8sf.c: New test.
	* gcc.target/i386/pr101908-v8si-adl.c: New test.
	* gcc.target/i386/pr101908-v8si.c: New test.
---
 gcc/config/i386/i386.cc                       | 51 +++++++++++
 gcc/config/i386/i386.h                        |  1 +
 gcc/config/i386/x86-tune-costs.h              | 28 ++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-2.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-3.c    | 90 +++++++++++++++++++
 .../gcc.target/i386/pr101908-v16hi.c          |  6 ++
 .../gcc.target/i386/pr101908-v16qi.c          | 30 +++++++
 .../gcc.target/i386/pr101908-v16sf.c          |  6 ++
 .../gcc.target/i386/pr101908-v16si.c          |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2si.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4si.c |  6 ++
 .../gcc.target/i386/pr101908-v8df-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8df.c |  6 ++
 .../gcc.target/i386/pr101908-v8di-adl.c       |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8di.c |  7 ++
 .../gcc.target/i386/pr101908-v8hi-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c |  6 ++
 .../gcc.target/i386/pr101908-v8qi-adl.c       | 22 +++++
 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c | 22 +++++
 .../gcc.target/i386/pr101908-v8sf-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c |  6 ++
 .../gcc.target/i386/pr101908-v8si-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8si.c |  6 ++
 34 files changed, 444 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d77ad83e437..c01809cc3da 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
+/* Return true if DR may have STF issue, otherwise false.
+   Any unaligned_load from parm_decl which is passed by stack
+   is considered to have STLF stall issue.  */
+static bool
+ix86_load_maybe_stfs_p (data_reference* dr)
+{
+  tree addr = DR_BASE_ADDRESS (dr);
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return false;
+  addr = get_base_address (TREE_OPERAND (addr, 0));
+
+  if (TREE_CODE (addr) != PARM_DECL)
+    return false;
+  tree type = TREE_TYPE (addr);
+  if (!type)
+    return false;
+
+  machine_mode mode = TYPE_MODE (type);
+
+  /* There could be false positives in determining whether a parameter is
+     passed on the stack, i.e. a parameter that can be put in registers
+     but is finally passed on the stack because registers have run out.  */
+  if (TARGET_64BIT)
+    {
+      /* From function_arg_64.  */
+      enum x86_64_reg_class regclass[MAX_CLASSES];
+      int zero_width_bitfields = 0;
+      return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
+    }
+  else
+    {
+      /* From function_arg_32.  */
+      return (mode == E_BLKmode
+	      || (AGGREGATE_TYPE_P (type)
+		  && (VECTOR_MODE_P (mode) || mode == TImode)));
+    }
+
+  return false;
+}
+
 /* x86-specific vector costs.  */
 class ix86_vector_costs : public vector_costs
 {
@@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  /* Penalize pure-SLP unaligned loads from a parm_decl passed on the stack
+     to avoid store-forwarding stalls (STF).  Performance may be lost when
+     there is no STF issue (1 vector_load vs n scalar_load + CTOR).
+     TODO: both the extra cost (ix86_cost->stfs) and ix86_load_maybe_stfs_p
+     need to be fine tuned.  */
+  if (kind == unaligned_load && stmt_info
+      && stmt_info->slp_type == pure_slp
+      && STMT_VINFO_DATA_REF (stmt_info)
+      && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
+    stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0d28e57f8f2..341f1c47981 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -168,6 +168,7 @@ struct processor_costs {
 				   in 32bit, 64bit, 128bit, 256bit and 512bit */
   const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int stfs;		 /* cost of store forward stalls.  */
   const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
 	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 017ffa69958..3a5fcdeefdd 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
 					   in 128bit, 256bit and 512bit */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
+  6,					/* cost of store forward stall.  */
   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   5, 0,					/* Gather load static, per_elt.  */
@@ -209,6 +210,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -317,6 +319,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  21,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
+  54,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   16,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
   6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
@@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  90,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
+  10,					/* cost of store forward stall.  */
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   16, 16,				/* Gather load static, per_elt.  */
@@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  32,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
 					   in SImode, DImode and TImode.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  48,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
+  22,					/* cost of store forward stall.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
   4,					/* cost of moving SSE register to integer.  */
   6, 6,					/* Gather load static, per_elt.  */
@@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  54,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   2,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..f8e0f2e26bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..f4ff7a83c82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
new file mode 100644
index 00000000000..6f853aa7750
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
@@ -0,0 +1,90 @@
+/* PR target/101908.  */
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not "add new stmt:.*MEM \<vector(2) double\>.*ray + 24B" "slp2" } }  */
+/* This testcase is used to avoid STLF stall.  */
+
+#define sqrt __builtin_sqrt
+#define SQ(x)		((x) * (x))
+struct vec3 {
+  double x, y, z;
+};
+
+struct ray {
+  struct vec3 orig, dir;
+};
+
+struct material {
+  struct vec3 col;	/* color */
+  double spow;		/* specular power */
+  double refl;		/* reflection intensity */
+};
+
+struct sphere {
+  struct vec3 pos;
+  double rad;
+  struct material mat;
+  struct sphere *next;
+};
+
+struct spoint {
+  struct vec3 pos, normal, vref;	/* position, normal and view reflection */
+  double dist;		/* parametric distance of intersection along the ray */
+};
+
+#define ERR_MARGIN		1e-6
+
+#define DOT(a, b)	((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
+#define NORMALIZE(a)  do {			\
+    double len = sqrt(DOT(a, a));		\
+    (a).x /= len; (a).y /= len; (a).z /= len;	\
+  } while(0);
+
+static struct vec3
+reflect(struct vec3 v, struct vec3 n) {
+  struct vec3 res;
+  double dot = v.x * n.x + v.y * n.y + v.z * n.z;
+  res.x = -(2.0 * dot * n.x - v.x);
+  res.y = -(2.0 * dot * n.y - v.y);
+  res.z = -(2.0 * dot * n.z - v.z);
+  return res;
+}
+
+int ray_sphere(const struct sphere *sph,
+	       struct ray ray, struct spoint *sp) {
+  double a, b, c, d, sqrt_d, t1, t2;
+	
+  a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
+  b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
+    2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
+    2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
+  c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
+    SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
+    2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
+	
+  if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
+
+  sqrt_d = sqrt(d);
+  t1 = (-b + sqrt_d) / (2.0 * a);
+  t2 = (-b - sqrt_d) / (2.0 * a);
+
+  if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
+
+  if(sp) {
+    if(t1 < ERR_MARGIN) t1 = t2;
+    if(t2 < ERR_MARGIN) t2 = t1;
+    sp->dist = t1 < t2 ? t1 : t2;
+		
+    sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
+    sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
+    sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
+		
+    sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
+    sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
+    sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
+
+    sp->vref = reflect(ray.dir, sp->normal);
+    NORMALIZE(sp->vref);
+  }
+  return 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
new file mode 100644
index 00000000000..fcd3ee8122f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
new file mode 100644
index 00000000000..6d43788600e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3  -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[0] = x.a[1] + y.a[1];
+  p[1] = x.a[2] + y.a[2];
+  p[2] = x.a[3] + y.a[3];
+  p[3] = x.a[4] + y.a[4];
+  p[4] = x.a[5] + y.a[5];
+  p[5] = x.a[6] + y.a[6];
+  p[6] = x.a[7] + y.a[7];
+  p[7] = x.a[8] + y.a[8];
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
new file mode 100644
index 00000000000..f95b85abbc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16si.c b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
new file mode 100644
index 00000000000..5c48aa5da69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2df.c b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
new file mode 100644
index 00000000000..9d3f157718c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2di.c b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
new file mode 100644
index 00000000000..c7cf9a71f21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
new file mode 100644
index 00000000000..e6024d70780
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
new file mode 100644
index 00000000000..cf876cc70d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
new file mode 100644
index 00000000000..eb6349b957e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2si.c b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
new file mode 100644
index 00000000000..ae5fa0749c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4df.c b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
new file mode 100644
index 00000000000..94497422704
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4di.c b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
new file mode 100644
index 00000000000..71407aa9fc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
new file mode 100644
index 00000000000..4b207b91225
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
new file mode 100644
index 00000000000..5292d3442ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
new file mode 100644
index 00000000000..a2c6273120d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4si.c b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
new file mode 100644
index 00000000000..c6824285c74
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
new file mode 100644
index 00000000000..248c6d0fb91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
new file mode 100644
index 00000000000..05eb2dd51d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
new file mode 100644
index 00000000000..b0055d7d2c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
new file mode 100644
index 00000000000..76a393bcc6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
new file mode 100644
index 00000000000..28977adae28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
new file mode 100644
index 00000000000..89b50885366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
new file mode 100644
index 00000000000..be668e5d006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
new file mode 100644
index 00000000000..842c88c8952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
new file mode 100644
index 00000000000..89d33566a40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
new file mode 100644
index 00000000000..81557c7b9b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
new file mode 100644
index 00000000000..883956a0d49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
new file mode 100644
index 00000000000..142f46012d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi.c"
-- 
2.18.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue.
  2022-03-08  3:39   ` Hongtao Liu
  2022-03-15  9:13     ` [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue liuhongt
  2022-03-16  1:03     ` liuhongt
@ 2022-03-16  2:19     ` liuhongt
  2022-03-16  9:54       ` Richard Biener
  2 siblings, 1 reply; 9+ messages in thread
From: liuhongt @ 2022-03-16  2:19 UTC (permalink / raw)
  To: gcc-patches

This patch only handles pure-slp for by-value passed parameters, which
has nothing to do with IPA but with the psABI. For by-reference passed
parameters, IPA is required.

The patch is aggressive in determining STLF failure: any
unaligned_load from a parm_decl passed on the stack is assumed to have
an STLF stall issue. It could lose some performance where there's no
such issue (1 vector_load vs n scalar_loads + CTOR).

According to the microbenchmark in the PR, the cost of an STLF failure
is generally between 8 and 16 scalar_loads on most recent Intel/AMD
processors.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
	(ix86_vector_costs::add_stmt_cost): Add extra cost for
	unaligned_load which may have store forwarding stall issue.
	* config/i386/i386.h (processor_costs): Add new member
	stfs.
	* config/i386/x86-tune-costs.h (i386_size_cost): Initialize
	stfs.
	(i386_cost, i486_cost, pentium_cost, lakemont_cost,
	pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost,
	amdfam10_cost, bdver_cost, znver1_cost, znver2_cost,
	znver3_cost, skylake_cost, icelake_cost, alderlake_cost,
	btver1_cost, btver2_cost, pentium4_cost, nocona_cost,
	atom_cost, slm_cost, tremont_cost, intel_cost, generic_cost,
	core_cost): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
	* gcc.target/i386/pr101908-3.c: New test.
	* gcc.target/i386/pr101908-v16hi.c: New test.
	* gcc.target/i386/pr101908-v16qi.c: New test.
	* gcc.target/i386/pr101908-v16sf.c: New test.
	* gcc.target/i386/pr101908-v16si.c: New test.
	* gcc.target/i386/pr101908-v2df.c: New test.
	* gcc.target/i386/pr101908-v2di.c: New test.
	* gcc.target/i386/pr101908-v2hi.c: New test.
	* gcc.target/i386/pr101908-v2qi.c: New test.
	* gcc.target/i386/pr101908-v2sf.c: New test.
	* gcc.target/i386/pr101908-v2si.c: New test.
	* gcc.target/i386/pr101908-v4df.c: New test.
	* gcc.target/i386/pr101908-v4di.c: New test.
	* gcc.target/i386/pr101908-v4hi.c: New test.
	* gcc.target/i386/pr101908-v4qi.c: New test.
	* gcc.target/i386/pr101908-v4sf.c: New test.
	* gcc.target/i386/pr101908-v4si.c: New test.
	* gcc.target/i386/pr101908-v8df-adl.c: New test.
	* gcc.target/i386/pr101908-v8df.c: New test.
	* gcc.target/i386/pr101908-v8di-adl.c: New test.
	* gcc.target/i386/pr101908-v8di.c: New test.
	* gcc.target/i386/pr101908-v8hi-adl.c: New test.
	* gcc.target/i386/pr101908-v8hi.c: New test.
	* gcc.target/i386/pr101908-v8qi-adl.c: New test.
	* gcc.target/i386/pr101908-v8qi.c: New test.
	* gcc.target/i386/pr101908-v8sf-adl.c: New test.
	* gcc.target/i386/pr101908-v8sf.c: New test.
	* gcc.target/i386/pr101908-v8si-adl.c: New test.
	* gcc.target/i386/pr101908-v8si.c: New test.
---
 gcc/config/i386/i386.cc                       | 51 +++++++++++
 gcc/config/i386/i386.h                        |  1 +
 gcc/config/i386/x86-tune-costs.h              | 28 ++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-2.c    | 12 +++
 gcc/testsuite/gcc.target/i386/pr101908-3.c    | 90 +++++++++++++++++++
 .../gcc.target/i386/pr101908-v16hi.c          |  6 ++
 .../gcc.target/i386/pr101908-v16qi.c          | 30 +++++++
 .../gcc.target/i386/pr101908-v16sf.c          |  6 ++
 .../gcc.target/i386/pr101908-v16si.c          |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v2si.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4df.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4di.c |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v4si.c |  6 ++
 .../gcc.target/i386/pr101908-v8df-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8df.c |  6 ++
 .../gcc.target/i386/pr101908-v8di-adl.c       |  7 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8di.c |  7 ++
 .../gcc.target/i386/pr101908-v8hi-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c |  6 ++
 .../gcc.target/i386/pr101908-v8qi-adl.c       | 22 +++++
 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c | 22 +++++
 .../gcc.target/i386/pr101908-v8sf-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c |  6 ++
 .../gcc.target/i386/pr101908-v8si-adl.c       |  6 ++
 gcc/testsuite/gcc.target/i386/pr101908-v8si.c |  6 ++
 34 files changed, 444 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4si.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d77ad83e437..c01809cc3da 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
+/* Return true if the load described by DR may have a store-forwarding
+   stall (STLF failure), otherwise false.  Any unaligned_load from a
+   parm_decl which is passed on the stack is considered to have it.  */
+static bool
+ix86_load_maybe_stfs_p (data_reference* dr)
+{
+  tree addr = DR_BASE_ADDRESS (dr);
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return false;
+  addr = get_base_address (TREE_OPERAND (addr, 0));
+
+  if (TREE_CODE (addr) != PARM_DECL)
+    return false;
+  tree type = TREE_TYPE (addr);
+  if (!type)
+    return false;
+
+  machine_mode mode = TYPE_MODE (type);
+
+  /* There could be false positives in determining whether a parameter is
+     passed on the stack, i.e. a parameter can be put in registers but is
+     finally passed on the stack because the registers have run out.  */
+  if (TARGET_64BIT)
+    {
+      /* From function_arg_64.  */
+      enum x86_64_reg_class regclass[MAX_CLASSES];
+      int zero_width_bitfields = 0;
+      return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
+    }
+  else
+    {
+      /* From function_arg_32.  */
+      return (mode == E_BLKmode
+	      || (AGGREGATE_TYPE_P (type)
+		  && (VECTOR_MODE_P (mode) || mode == TImode)));
+    }
+
+  return false;
+}
+
 /* x86-specific vector costs.  */
 class ix86_vector_costs : public vector_costs
 {
@@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  /* Penalize pure-SLP unaligned loads from a parm_decl passed on the stack
+     to avoid STF issue.  Performance may be lost when there's no STF issue
+     (1 vector_load vs n scalar_load + CTOR).
+     TODO: both the extra cost (ix86_cost->stfs / 2) and
+     ix86_load_maybe_stfs_p need to be fine tuned.  */
+  if (kind == unaligned_load && stmt_info
+      && stmt_info->slp_type == pure_slp
+      && STMT_VINFO_DATA_REF (stmt_info)
+      && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
+    stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0d28e57f8f2..341f1c47981 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -168,6 +168,7 @@ struct processor_costs {
 				   in 32bit, 64bit, 128bit, 256bit and 512bit */
   const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int stfs;		 /* cost of store forward stalls.  */
   const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
 	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 017ffa69958..3a5fcdeefdd 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
 					   in 128bit, 256bit and 512bit */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
+  6,					/* cost of store forward stall.  */
   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   5, 0,					/* Gather load static, per_elt.  */
@@ -209,6 +210,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -317,6 +319,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  24,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
@@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
+  14,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  21,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
@@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
+  54,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   16,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
   6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
@@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  42,					/* cost of store forward stall.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
@@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
@@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  90,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
+  36,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
@@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
+  10,					/* cost of store forward stall.  */
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   16, 16,				/* Gather load static, per_elt.  */
@@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
+  8,					/* cost of store forward stall.  */
   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
@@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  32,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
 					   in SImode, DImode and TImode.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  48,					/* cost of store forward stall.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
@@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  42,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
+  22,					/* cost of store forward stall.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
   4,					/* cost of moving SSE register to integer.  */
   6, 6,					/* Gather load static, per_elt.  */
@@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
+  54,					/* cost of store forward stall.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
@@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
+  26,					/* cost of store forward stall.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   2,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..f8e0f2e26bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..f4ff7a83c82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
new file mode 100644
index 00000000000..6f853aa7750
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
@@ -0,0 +1,90 @@
+/* PR target/101908.  */
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not "add new stmt:.*MEM \<vector(2) double\>.*ray + 24B" "slp2" } }  */
+/* This testcase is used to avoid STLF stall.  */
+
+#define sqrt __builtin_sqrt
+#define SQ(x)		((x) * (x))
+struct vec3 {
+  double x, y, z;
+};
+
+struct ray {
+  struct vec3 orig, dir;
+};
+
+struct material {
+  struct vec3 col;	/* color */
+  double spow;		/* specular power */
+  double refl;		/* reflection intensity */
+};
+
+struct sphere {
+  struct vec3 pos;
+  double rad;
+  struct material mat;
+  struct sphere *next;
+};
+
+struct spoint {
+  struct vec3 pos, normal, vref;	/* position, normal and view reflection */
+  double dist;		/* parametric distance of intersection along the ray */
+};
+
+#define ERR_MARGIN		1e-6
+
+#define DOT(a, b)	((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
+#define NORMALIZE(a)  do {			\
+    double len = sqrt(DOT(a, a));		\
+    (a).x /= len; (a).y /= len; (a).z /= len;	\
+  } while(0);
+
+static struct vec3
+reflect(struct vec3 v, struct vec3 n) {
+  struct vec3 res;
+  double dot = v.x * n.x + v.y * n.y + v.z * n.z;
+  res.x = -(2.0 * dot * n.x - v.x);
+  res.y = -(2.0 * dot * n.y - v.y);
+  res.z = -(2.0 * dot * n.z - v.z);
+  return res;
+}
+
+int ray_sphere(const struct sphere *sph,
+	       struct ray ray, struct spoint *sp) {
+  double a, b, c, d, sqrt_d, t1, t2;
+	
+  a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
+  b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
+    2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
+    2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
+  c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
+    SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
+    2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
+	
+  if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
+
+  sqrt_d = sqrt(d);
+  t1 = (-b + sqrt_d) / (2.0 * a);
+  t2 = (-b - sqrt_d) / (2.0 * a);
+
+  if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
+
+  if(sp) {
+    if(t1 < ERR_MARGIN) t1 = t2;
+    if(t2 < ERR_MARGIN) t2 = t1;
+    sp->dist = t1 < t2 ? t1 : t2;
+		
+    sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
+    sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
+    sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
+		
+    sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
+    sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
+    sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
+
+    sp->vref = reflect(ray.dir, sp->normal);
+    NORMALIZE(sp->vref);
+  }
+  return 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
new file mode 100644
index 00000000000..fcd3ee8122f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
new file mode 100644
index 00000000000..6d43788600e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3  -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[0] = x.a[1] + y.a[1];
+  p[1] = x.a[2] + y.a[2];
+  p[2] = x.a[3] + y.a[3];
+  p[3] = x.a[4] + y.a[4];
+  p[4] = x.a[5] + y.a[5];
+  p[5] = x.a[6] + y.a[6];
+  p[6] = x.a[7] + y.a[7];
+  p[7] = x.a[8] + y.a[8];
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
new file mode 100644
index 00000000000..f95b85abbc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16si.c b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
new file mode 100644
index 00000000000..5c48aa5da69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v16qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2df.c b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
new file mode 100644
index 00000000000..9d3f157718c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2di.c b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
new file mode 100644
index 00000000000..c7cf9a71f21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
new file mode 100644
index 00000000000..e6024d70780
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
new file mode 100644
index 00000000000..cf876cc70d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
new file mode 100644
index 00000000000..eb6349b957e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2si.c b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
new file mode 100644
index 00000000000..ae5fa0749c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v2qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4df.c b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
new file mode 100644
index 00000000000..94497422704
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4di.c b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
new file mode 100644
index 00000000000..71407aa9fc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
new file mode 100644
index 00000000000..4b207b91225
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
new file mode 100644
index 00000000000..5292d3442ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
new file mode 100644
index 00000000000..a2c6273120d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4si.c b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
new file mode 100644
index 00000000000..c6824285c74
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v4qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
new file mode 100644
index 00000000000..248c6d0fb91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
new file mode 100644
index 00000000000..05eb2dd51d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
+
+#define TYPE double
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
new file mode 100644
index 00000000000..b0055d7d2c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
new file mode 100644
index 00000000000..76a393bcc6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
+
+typedef long long int64_t;
+#define TYPE int64_t
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
new file mode 100644
index 00000000000..28977adae28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
new file mode 100644
index 00000000000..89b50885366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
+
+#define TYPE short
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
new file mode 100644
index 00000000000..be668e5d006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
new file mode 100644
index 00000000000..842c88c8952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
+
+#ifndef TYPE
+#define TYPE char
+#endif
+
+struct X { TYPE a[128]; };
+
+void __attribute__((noipa))
+foo16 (struct X x, struct X y, TYPE* __restrict p)
+{
+  p[8] = x.a[9] + y.a[9];
+  p[9] = x.a[10] + y.a[10];
+  p[10] = x.a[11] + y.a[11];
+  p[11] = x.a[12] + y.a[12];
+  p[12] = x.a[13] + y.a[13];
+  p[13] = x.a[14] + y.a[14];
+  p[14] = x.a[15] + y.a[15];
+  p[15] = x.a[16] + y.a[16];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
new file mode 100644
index 00000000000..89d33566a40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
new file mode 100644
index 00000000000..81557c7b9b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
+
+#define TYPE float
+#include "pr101908-v8qi.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
new file mode 100644
index 00000000000..883956a0d49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi-adl.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
new file mode 100644
index 00000000000..142f46012d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
+
+#define TYPE int
+#include "pr101908-v8qi.c"
-- 
2.18.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue.
  2022-03-16  2:19     ` liuhongt
@ 2022-03-16  9:54       ` Richard Biener
  2022-03-17  7:12         ` Hongtao Liu
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2022-03-16  9:54 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Wed, Mar 16, 2022 at 3:19 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This patch only handles pure-slp for by-value passed parameters, which
> have nothing to do with IPA but with the psABI. For by-reference passed
> parameters IPA is required.
>
> The patch is aggressive in determining STLF failure: any
> unaligned_load from a parm_decl passed on the stack is assumed to have
> an STLF stall issue. It could lose some perf where there's no such
> issue (1 vector_load vs n scalar_load + CTOR).
>
> According to the microbenchmark in the PR, the cost of an STLF failure is
> generally between 8 and 16 scalar loads on most recent Intel/AMD
> processors.
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
>         (ix86_vector_costs::add_stmt_cost): Add extra cost for
>         unaligned_load which may have store forwarding stall issue.
>         * config/i386/i386.h (processor_costs): Add new member
>         stfs.
>         * config/i386/x86-tune-costs.h (i386_size_cost): Initialize
>         stfs.
>         (i386_cost, i486_cost, pentium_cost, lakemont_cost,
>         pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost,
>         amdfam10_cost, bdver_cost, znver1_cost, znver2_cost,
>         znver3_cost, skylake_cost, icelake_cost, alderlake_cost,
>         btver1_cost, btver2_cost, pentium4_cost, nocona_cost,
>         atom_cost, slm_cost, tremont_cost, intel_cost, generic_cost,
>         core_cost): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
>         * gcc.target/i386/pr101908-3.c: New test.
>         * gcc.target/i386/pr101908-v16hi.c: New test.
>         * gcc.target/i386/pr101908-v16qi.c: New test.
>         * gcc.target/i386/pr101908-v16sf.c: New test.
>         * gcc.target/i386/pr101908-v16si.c: New test.
>         * gcc.target/i386/pr101908-v2df.c: New test.
>         * gcc.target/i386/pr101908-v2di.c: New test.
>         * gcc.target/i386/pr101908-v2hi.c: New test.
>         * gcc.target/i386/pr101908-v2qi.c: New test.
>         * gcc.target/i386/pr101908-v2sf.c: New test.
>         * gcc.target/i386/pr101908-v2si.c: New test.
>         * gcc.target/i386/pr101908-v4df.c: New test.
>         * gcc.target/i386/pr101908-v4di.c: New test.
>         * gcc.target/i386/pr101908-v4hi.c: New test.
>         * gcc.target/i386/pr101908-v4qi.c: New test.
>         * gcc.target/i386/pr101908-v4sf.c: New test.
>         * gcc.target/i386/pr101908-v4si.c: New test.
>         * gcc.target/i386/pr101908-v8df-adl.c: New test.
>         * gcc.target/i386/pr101908-v8df.c: New test.
>         * gcc.target/i386/pr101908-v8di-adl.c: New test.
>         * gcc.target/i386/pr101908-v8di.c: New test.
>         * gcc.target/i386/pr101908-v8hi-adl.c: New test.
>         * gcc.target/i386/pr101908-v8hi.c: New test.
>         * gcc.target/i386/pr101908-v8qi-adl.c: New test.
>         * gcc.target/i386/pr101908-v8qi.c: New test.
>         * gcc.target/i386/pr101908-v8sf-adl.c: New test.
>         * gcc.target/i386/pr101908-v8sf.c: New test.
>         * gcc.target/i386/pr101908-v8si-adl.c: New test.
>         * gcc.target/i386/pr101908-v8si.c: New test.
> ---
>  gcc/config/i386/i386.cc                       | 51 +++++++++++
>  gcc/config/i386/i386.h                        |  1 +
>  gcc/config/i386/x86-tune-costs.h              | 28 ++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c    | 12 +++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c    | 12 +++
>  gcc/testsuite/gcc.target/i386/pr101908-3.c    | 90 +++++++++++++++++++
>  .../gcc.target/i386/pr101908-v16hi.c          |  6 ++
>  .../gcc.target/i386/pr101908-v16qi.c          | 30 +++++++
>  .../gcc.target/i386/pr101908-v16sf.c          |  6 ++
>  .../gcc.target/i386/pr101908-v16si.c          |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v2df.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v2di.c |  7 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v2hi.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v2qi.c | 16 ++++
>  gcc/testsuite/gcc.target/i386/pr101908-v2sf.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v2si.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v4df.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v4di.c |  7 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v4hi.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v4qi.c | 18 ++++
>  gcc/testsuite/gcc.target/i386/pr101908-v4sf.c |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v4si.c |  6 ++
>  .../gcc.target/i386/pr101908-v8df-adl.c       |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v8df.c |  6 ++
>  .../gcc.target/i386/pr101908-v8di-adl.c       |  7 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v8di.c |  7 ++
>  .../gcc.target/i386/pr101908-v8hi-adl.c       |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v8hi.c |  6 ++
>  .../gcc.target/i386/pr101908-v8qi-adl.c       | 22 +++++
>  gcc/testsuite/gcc.target/i386/pr101908-v8qi.c | 22 +++++
>  .../gcc.target/i386/pr101908-v8sf-adl.c       |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v8sf.c |  6 ++
>  .../gcc.target/i386/pr101908-v8si-adl.c       |  6 ++
>  gcc/testsuite/gcc.target/i386/pr101908-v8si.c |  6 ++
>  34 files changed, 444 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16si.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2df.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2di.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2si.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4df.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4di.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4si.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d77ad83e437..c01809cc3da 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
>    return default_noce_conversion_profitable_p (seq, if_info);
>  }
>
> +/* Return true if REF may have STF issue, otherwise false.
> +   Any unaligned_load from parm_decl which is passed by stack
> +   is considered to have STLF stall issue.  */
> +static bool
> +ix86_load_maybe_stfs_p (data_reference* dr)
> +{
> +  tree addr = DR_BASE_ADDRESS (dr);
> +  if (TREE_CODE (addr) != ADDR_EXPR)
> +    return false;
> +  addr = get_base_address (TREE_OPERAND (addr, 0));
> +
> +  if (TREE_CODE (addr) != PARM_DECL)
> +    return false;
> +  tree type = TREE_TYPE (addr);
> +  if (!type)

type should never be NULL

> +    return false;
> +
> +  machine_mode mode = TYPE_MODE (type);
> +
> +  /* There could be false positive in determine parameter passed by stack.
> +     .i.e. parameter can be put in registers but finally passed by stack
> +     because registers are ran out.  */
> +  if (TARGET_64BIT)
> +    {
> +      /* From function_arg_64.  */
> +      enum x86_64_reg_class regclass[MAX_CLASSES];
> +      int zero_width_bitfields = 0;
> +      return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
> +    }
> +  else
> +    {
> +      /* From function_arg_32.  */
> +      return (mode == E_BLKmode
> +             || (AGGREGATE_TYPE_P (type)
> +                 && (VECTOR_MODE_P (mode) || mode == TImode)));
> +    }
> +
> +  return false;

that stmt is unreachable.

> +}
> +
>  /* x86-specific vector costs.  */
>  class ix86_vector_costs : public vector_costs
>  {
> @@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>    if (stmt_cost == -1)
>      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
>
> +  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
> +     Performance may lose when there's no STF issue(1 vector_load vs n
> +     scalar_load + CTOR).
> +     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine

cost(2000) is no longer there

> +     tuned.  */
> +  if (kind == unaligned_load && stmt_info
> +      && stmt_info->slp_type == pure_slp

Do you want to restrict this to BB vectorization?  pure_slp isn't exactly
that; instead you can do

           && is_a <bb_vec_info> (m_vinfo)

> +      && STMT_VINFO_DATA_REF (stmt_info)
> +      && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
> +    stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);

I wonder why we divide stfs by two?

I'd suggest an additional check, that the DR is close to function start.  One
possible check that occurs to me is to check

  STMT_VINFO_DR_INFO (stmt_info)->group == 0

that will for example avoid the penalty for

struct Y y;
void foo (struct X x)
{
  bar();
  y.a = x.a;
  y.b = x.b;
}

but also (maybe not wanted) when the access happens after control
flow transfer like with

struct Y y;
void foo (struct X x, int flag)
{
  if (flag)
   {
    y.a = x.a;
    y.b = x.b;
   }
}

I think we should be conservative with what we pessimize until we have
evidence that we need to include more cases, also since this after-the-fact
handling of the issue in costing is sub-optimal.  Ideally the vectorizer itself
would decide to vectorize the load in a way that avoids STLF failures, but
that's nothing we can easily arrange for at this stage.

Another option could be to split such loads during md-reorg, where we could
somehow "count" the latency from function entry, only scanning paths from
there up to a point where the store buffer is likely not drained (with
a different target cost parameter?) and only scanning BBs that are not
optimize_for_size.  That might be a better place to do after-the-fact
adjustments (the cost adjustment won't avoid the STLF failure if the rest of
the vectorization compensates for the penalty).

Richard.

> +
>    /* Penalize DFmode vector operations for Bonnell.  */
>    if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
>        && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 0d28e57f8f2..341f1c47981 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -168,6 +168,7 @@ struct processor_costs {
>                                    in 32bit, 64bit, 128bit, 256bit and 512bit */
>    const int sse_unaligned_load[5];/* cost of unaligned load.  */
>    const int sse_unaligned_store[5];/* cost of unaligned store.  */
> +  const int stfs;               /* cost of store forward stalls.  */
>    const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
>             zmm_move;
>    const int sse_to_integer;    /* cost of moving SSE register to integer.  */
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 017ffa69958..3a5fcdeefdd 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
>                                            in 128bit, 256bit and 512bit */
>    {3, 3, 3, 3, 3},                     /* cost of unaligned SSE store
>                                            in 128bit, 256bit and 512bit */
> +  6,                                   /* cost of store forward stall.  */
>    3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    5, 0,                                        /* Gather load static, per_elt.  */
> @@ -209,6 +210,7 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> +  8,                                   /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -317,6 +319,7 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> +  8,                                   /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> +  8,                                   /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> +  8,                                   /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
>    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> +  24,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
>    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
> +  14,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    2, 2,                                        /* Gather load static, per_elt.  */
> @@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
>    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
> +  24,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    2, 2,                                        /* Gather load static, per_elt.  */
> @@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 4, 12, 12, 24},                  /* cost of unaligned loads.  */
>    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
> +  14,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    5,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 3, 12, 12, 24},                  /* cost of unaligned loads.  */
>    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
> +  14,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    5,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
>    {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
> +  21,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    3,                                   /* cost of moving SSE register to integer.  */
>    4, 4,                                        /* Gather load static, per_elt.  */
> @@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {12, 12, 10, 40, 60},                        /* cost of unaligned loads.  */
>    {10, 10, 10, 40, 60},                        /* cost of unaligned stores.  */
> +  54,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    16,                                  /* cost of moving SSE register to integer.  */
>    12, 12,                              /* Gather load static, per_elt.  */
> @@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
>    {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
> +  42,                                  /* cost of store forward stall.  */
>    2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
>    6,                                   /* cost of moving SSE register to integer.  */
>    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> @@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
>    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> +  42,                                  /* cost of store forward stall.  */
>    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
>    6,                                   /* cost of moving SSE register to integer.  */
> @@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
>    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> +  42,                                  /* cost of store forward stall.  */
>    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
>    6,                                   /* cost of moving SSE register to integer.  */
> @@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
>    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> +  26,                                  /* cost of store forward stall.  */
>    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    20, 8,                               /* Gather load static, per_elt.  */
> @@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
>    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> +  26,                                  /* cost of store forward stall.  */
>    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    20, 8,                               /* Gather load static, per_elt.  */
> @@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> +  90,                                  /* cost of store forward stall.  */
>    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    18, 6,                               /* Gather load static, per_elt.  */
> @@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
>    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
> +  36,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    14,                                  /* cost of moving SSE register to integer.  */
>    10, 10,                              /* Gather load static, per_elt.  */
> @@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
>    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
> +  36,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    14,                                  /* cost of moving SSE register to integer.  */
>    10, 10,                              /* Gather load static, per_elt.  */
> @@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {32, 32, 32, 64, 128},               /* cost of unaligned loads.  */
>    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
> +  10,                                  /* cost of store forward stall.  */
>    12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
>    20,                                  /* cost of moving SSE register to integer.  */
>    16, 16,                              /* Gather load static, per_elt.  */
> @@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {24, 24, 24, 48, 96},                        /* cost of unaligned loads.  */
>    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
> +  8,                                   /* cost of store forward stall.  */
>    6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
>    20,                                  /* cost of moving SSE register to integer.  */
>    12, 12,                              /* Gather load static, per_elt.  */
> @@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
>    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
> +  32,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    8,                                   /* cost of moving SSE register to integer.  */
>    8, 8,                                        /* Gather load static, per_elt.  */
> @@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
>                                            in SImode, DImode and TImode.  */
>    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
>    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
> +  48,                                  /* cost of store forward stall.  */
>    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
>    8,                                   /* cost of moving SSE register to integer.  */
>    8, 8,                                        /* Gather load static, per_elt.  */
> @@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> +  42,                                  /* cost of store forward stall.  */
>    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    18, 6,                               /* Gather load static, per_elt.  */
> @@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
>    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
> +  22,                                  /* cost of store forward stall.  */
>    2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
>    4,                                   /* cost of moving SSE register to integer.  */
>    6, 6,                                        /* Gather load static, per_elt.  */
> @@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
>    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> +  54,                                  /* cost of store forward stall.  */
>    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    6,                                   /* cost of moving SSE register to integer.  */
>    18, 6,                               /* Gather load static, per_elt.  */
> @@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
>                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
>    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
>    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
> +  26,                                  /* cost of store forward stall.  */
>    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
>    2,                                   /* cost of moving SSE register to integer.  */
>    /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..f8e0f2e26bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..f4ff7a83c82
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> new file mode 100644
> index 00000000000..6f853aa7750
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> @@ -0,0 +1,90 @@
> +/* PR target/101908.  */
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not "add new stmt:.*MEM \<vector(2) double\>.*ray + 24B" "slp2" } }  */
> +/* This testcase is used to avoid STLF stall.  */
> +
> +#define sqrt __builtin_sqrt
> +#define SQ(x)          ((x) * (x))
> +struct vec3 {
> +  double x, y, z;
> +};
> +
> +struct ray {
> +  struct vec3 orig, dir;
> +};
> +
> +struct material {
> +  struct vec3 col;     /* color */
> +  double spow;         /* specular power */
> +  double refl;         /* reflection intensity */
> +};
> +
> +struct sphere {
> +  struct vec3 pos;
> +  double rad;
> +  struct material mat;
> +  struct sphere *next;
> +};
> +
> +struct spoint {
> +  struct vec3 pos, normal, vref;       /* position, normal and view reflection */
> +  double dist;         /* parametric distance of intersection along the ray */
> +};
> +
> +#define ERR_MARGIN             1e-6
> +
> +#define DOT(a, b)      ((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
> +#define NORMALIZE(a)  do {                     \
> +    double len = sqrt(DOT(a, a));              \
> +    (a).x /= len; (a).y /= len; (a).z /= len;  \
> +  } while(0);
> +
> +static struct vec3
> +reflect(struct vec3 v, struct vec3 n) {
> +  struct vec3 res;
> +  double dot = v.x * n.x + v.y * n.y + v.z * n.z;
> +  res.x = -(2.0 * dot * n.x - v.x);
> +  res.y = -(2.0 * dot * n.y - v.y);
> +  res.z = -(2.0 * dot * n.z - v.z);
> +  return res;
> +}
> +
> +int ray_sphere(const struct sphere *sph,
> +              struct ray ray, struct spoint *sp) {
> +  double a, b, c, d, sqrt_d, t1, t2;
> +
> +  a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
> +  b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
> +    2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
> +    2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
> +  c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
> +    SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
> +    2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
> +
> +  if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
> +
> +  sqrt_d = sqrt(d);
> +  t1 = (-b + sqrt_d) / (2.0 * a);
> +  t2 = (-b - sqrt_d) / (2.0 * a);
> +
> +  if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
> +
> +  if(sp) {
> +    if(t1 < ERR_MARGIN) t1 = t2;
> +    if(t2 < ERR_MARGIN) t2 = t1;
> +    sp->dist = t1 < t2 ? t1 : t2;
> +
> +    sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
> +    sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
> +    sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
> +
> +    sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
> +    sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
> +    sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
> +
> +    sp->vref = reflect(ray.dir, sp->normal);
> +    NORMALIZE(sp->vref);
> +  }
> +  return 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
> new file mode 100644
> index 00000000000..fcd3ee8122f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
> +
> +#define TYPE short
> +#include "pr101908-v16qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
> new file mode 100644
> index 00000000000..6d43788600e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3  -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
> +
> +#ifndef TYPE
> +#define TYPE char
> +#endif
> +
> +struct X { TYPE a[128]; };
> +
> +void __attribute__((noipa))
> +foo16 (struct X x, struct X y, TYPE* __restrict p)
> +{
> +  p[0] = x.a[1] + y.a[1];
> +  p[1] = x.a[2] + y.a[2];
> +  p[2] = x.a[3] + y.a[3];
> +  p[3] = x.a[4] + y.a[4];
> +  p[4] = x.a[5] + y.a[5];
> +  p[5] = x.a[6] + y.a[6];
> +  p[6] = x.a[7] + y.a[7];
> +  p[7] = x.a[8] + y.a[8];
> +  p[8] = x.a[9] + y.a[9];
> +  p[9] = x.a[10] + y.a[10];
> +  p[10] = x.a[11] + y.a[11];
> +  p[11] = x.a[12] + y.a[12];
> +  p[12] = x.a[13] + y.a[13];
> +  p[13] = x.a[14] + y.a[14];
> +  p[14] = x.a[15] + y.a[15];
> +  p[15] = x.a[16] + y.a[16];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
> new file mode 100644
> index 00000000000..f95b85abbc6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
> +
> +#define TYPE float
> +#include "pr101908-v16qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16si.c b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
> new file mode 100644
> index 00000000000..5c48aa5da69
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
> +
> +#define TYPE int
> +#include "pr101908-v16qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2df.c b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
> new file mode 100644
> index 00000000000..9d3f157718c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> +
> +#define TYPE double
> +#include "pr101908-v2qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2di.c b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
> new file mode 100644
> index 00000000000..c7cf9a71f21
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
> +
> +typedef long long int64_t;
> +#define TYPE int64_t
> +#include "pr101908-v2qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
> new file mode 100644
> index 00000000000..e6024d70780
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
> +
> +#define TYPE short
> +#include "pr101908-v2qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
> new file mode 100644
> index 00000000000..cf876cc70d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
> +
> +#ifndef TYPE
> +#define TYPE char
> +#endif
> +
> +struct X { TYPE a[128]; };
> +
> +void __attribute__((noipa))
> +foo16 (struct X x, struct X y, TYPE* __restrict p)
> +{
> +  p[14] = x.a[15] + y.a[15];
> +  p[15] = x.a[16] + y.a[16];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
> new file mode 100644
> index 00000000000..eb6349b957e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
> +
> +#define TYPE float
> +#include "pr101908-v2qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2si.c b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
> new file mode 100644
> index 00000000000..ae5fa0749c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
> +
> +#define TYPE int
> +#include "pr101908-v2qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4df.c b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
> new file mode 100644
> index 00000000000..94497422704
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
> +
> +#define TYPE double
> +#include "pr101908-v4qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4di.c b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
> new file mode 100644
> index 00000000000..71407aa9fc7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
> +
> +typedef long long int64_t;
> +#define TYPE int64_t
> +#include "pr101908-v4qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
> new file mode 100644
> index 00000000000..4b207b91225
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
> +
> +#define TYPE short
> +#include "pr101908-v4qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
> new file mode 100644
> index 00000000000..5292d3442ec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
> +
> +#ifndef TYPE
> +#define TYPE char
> +#endif
> +
> +struct X { TYPE a[128]; };
> +
> +void __attribute__((noipa))
> +foo16 (struct X x, struct X y, TYPE* __restrict p)
> +{
> +  p[12] = x.a[13] + y.a[13];
> +  p[13] = x.a[14] + y.a[14];
> +  p[14] = x.a[15] + y.a[15];
> +  p[15] = x.a[16] + y.a[16];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
> new file mode 100644
> index 00000000000..a2c6273120d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
> +
> +#define TYPE float
> +#include "pr101908-v4qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4si.c b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
> new file mode 100644
> index 00000000000..c6824285c74
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
> +
> +#define TYPE int
> +#include "pr101908-v4qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
> new file mode 100644
> index 00000000000..248c6d0fb91
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
> +
> +#define TYPE double
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
> new file mode 100644
> index 00000000000..05eb2dd51d0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
> +
> +#define TYPE double
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
> new file mode 100644
> index 00000000000..b0055d7d2c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
> +
> +typedef long long int64_t;
> +#define TYPE int64_t
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
> new file mode 100644
> index 00000000000..76a393bcc6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
> +
> +typedef long long int64_t;
> +#define TYPE int64_t
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
> new file mode 100644
> index 00000000000..28977adae28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
> +
> +#define TYPE short
> +#include "pr101908-v8qi-adl.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
> new file mode 100644
> index 00000000000..89b50885366
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
> +
> +#define TYPE short
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
> new file mode 100644
> index 00000000000..be668e5d006
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
> +
> +#ifndef TYPE
> +#define TYPE char
> +#endif
> +
> +struct X { TYPE a[128]; };
> +
> +void __attribute__((noipa))
> +foo16 (struct X x, struct X y, TYPE* __restrict p)
> +{
> +  p[8] = x.a[9] + y.a[9];
> +  p[9] = x.a[10] + y.a[10];
> +  p[10] = x.a[11] + y.a[11];
> +  p[11] = x.a[12] + y.a[12];
> +  p[12] = x.a[13] + y.a[13];
> +  p[13] = x.a[14] + y.a[14];
> +  p[14] = x.a[15] + y.a[15];
> +  p[15] = x.a[16] + y.a[16];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
> new file mode 100644
> index 00000000000..842c88c8952
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
> +
> +#ifndef TYPE
> +#define TYPE char
> +#endif
> +
> +struct X { TYPE a[128]; };
> +
> +void __attribute__((noipa))
> +foo16 (struct X x, struct X y, TYPE* __restrict p)
> +{
> +  p[8] = x.a[9] + y.a[9];
> +  p[9] = x.a[10] + y.a[10];
> +  p[10] = x.a[11] + y.a[11];
> +  p[11] = x.a[12] + y.a[12];
> +  p[12] = x.a[13] + y.a[13];
> +  p[13] = x.a[14] + y.a[14];
> +  p[14] = x.a[15] + y.a[15];
> +  p[15] = x.a[16] + y.a[16];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
> new file mode 100644
> index 00000000000..89d33566a40
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
> +
> +#define TYPE float
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
> new file mode 100644
> index 00000000000..81557c7b9b7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
> +
> +#define TYPE float
> +#include "pr101908-v8qi.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
> new file mode 100644
> index 00000000000..883956a0d49
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
> +
> +#define TYPE int
> +#include "pr101908-v8qi-adl.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
> new file mode 100644
> index 00000000000..142f46012d7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
> +
> +#define TYPE int
> +#include "pr101908-v8qi.c"
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue.
  2022-03-16  9:54       ` Richard Biener
@ 2022-03-17  7:12         ` Hongtao Liu
  0 siblings, 0 replies; 9+ messages in thread
From: Hongtao Liu @ 2022-03-17  7:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, GCC Patches

On Wed, Mar 16, 2022 at 5:54 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, Mar 16, 2022 at 3:19 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > This patch only handles pure-slp for by-value passed parameters, which
> > has nothing to do with IPA but with the psABI. For by-reference passed
> > parameters IPA is required.
> >
> > The patch is aggressive in determining STLF failure: any
> > unaligned_load from a parm_decl passed on the stack is assumed to have
> > an STLF stall issue. It could lose some perf where there's no such
> > issue (1 vector_load vs n scalar_load + CTOR).
> >
> > According to the microbenchmark in the PR, the cost of an STLF failure
> > is generally between 8 and 16 scalar_loads on most recent Intel/AMD
> > processors.
> >
> > gcc/ChangeLog:
> >
> >         PR target/101908
> >         * config/i386/i386.cc (ix86_load_maybe_stfs_p): New.
> >         (ix86_vector_costs::add_stmt_cost): Add extra cost for
> >         unaligned_load which may have store forwarding stall issue.
> >         * config/i386/i386.h (processor_costs): Add new member
> >         stfs.
> >         * config/i386/x86-tune-costs.h (i386_size_cost): Initialize
> >         stfs.
> >         (i386_cost, i486_cost, pentium_cost, lakemont_cost,
> >         pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost,
> >         amdfam10_cost, bdver_cost, znver1_cost, znver2_cost,
> >         znver3_cost, skylake_cost, icelake_cost, alderlake_cost,
> >         btver1_cost, btver2_cost, pentium4_cost, nocona_cost,
> >         atom_cost, slm_cost, tremont_cost, intel_cost, generic_cost,
> >         core_cost): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr101908-1.c: New test.
> >         * gcc.target/i386/pr101908-2.c: New test.
> >         * gcc.target/i386/pr101908-3.c: New test.
> >         * gcc.target/i386/pr101908-v16hi.c: New test.
> >         * gcc.target/i386/pr101908-v16qi.c: New test.
> >         * gcc.target/i386/pr101908-v16sf.c: New test.
> >         * gcc.target/i386/pr101908-v16si.c: New test.
> >         * gcc.target/i386/pr101908-v2df.c: New test.
> >         * gcc.target/i386/pr101908-v2di.c: New test.
> >         * gcc.target/i386/pr101908-v2hi.c: New test.
> >         * gcc.target/i386/pr101908-v2qi.c: New test.
> >         * gcc.target/i386/pr101908-v2sf.c: New test.
> >         * gcc.target/i386/pr101908-v2si.c: New test.
> >         * gcc.target/i386/pr101908-v4df.c: New test.
> >         * gcc.target/i386/pr101908-v4di.c: New test.
> >         * gcc.target/i386/pr101908-v4hi.c: New test.
> >         * gcc.target/i386/pr101908-v4qi.c: New test.
> >         * gcc.target/i386/pr101908-v4sf.c: New test.
> >         * gcc.target/i386/pr101908-v4si.c: New test.
> >         * gcc.target/i386/pr101908-v8df-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8df.c: New test.
> >         * gcc.target/i386/pr101908-v8di-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8di.c: New test.
> >         * gcc.target/i386/pr101908-v8hi-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8hi.c: New test.
> >         * gcc.target/i386/pr101908-v8qi-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8qi.c: New test.
> >         * gcc.target/i386/pr101908-v8sf-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8sf.c: New test.
> >         * gcc.target/i386/pr101908-v8si-adl.c: New test.
> >         * gcc.target/i386/pr101908-v8si.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                       | 51 +++++++++++
> >  gcc/config/i386/i386.h                        |  1 +
> >  gcc/config/i386/x86-tune-costs.h              | 28 ++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-1.c    | 12 +++
> >  gcc/testsuite/gcc.target/i386/pr101908-2.c    | 12 +++
> >  gcc/testsuite/gcc.target/i386/pr101908-3.c    | 90 +++++++++++++++++++
> >  .../gcc.target/i386/pr101908-v16hi.c          |  6 ++
> >  .../gcc.target/i386/pr101908-v16qi.c          | 30 +++++++
> >  .../gcc.target/i386/pr101908-v16sf.c          |  6 ++
> >  .../gcc.target/i386/pr101908-v16si.c          |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2df.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2di.c |  7 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2hi.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2qi.c | 16 ++++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2sf.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v2si.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4df.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4di.c |  7 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4hi.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4qi.c | 18 ++++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4sf.c |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v4si.c |  6 ++
> >  .../gcc.target/i386/pr101908-v8df-adl.c       |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8df.c |  6 ++
> >  .../gcc.target/i386/pr101908-v8di-adl.c       |  7 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8di.c |  7 ++
> >  .../gcc.target/i386/pr101908-v8hi-adl.c       |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8hi.c |  6 ++
> >  .../gcc.target/i386/pr101908-v8qi-adl.c       | 22 +++++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8qi.c | 22 +++++
> >  .../gcc.target/i386/pr101908-v8sf-adl.c       |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8sf.c |  6 ++
> >  .../gcc.target/i386/pr101908-v8si-adl.c       |  6 ++
> >  gcc/testsuite/gcc.target/i386/pr101908-v8si.c |  6 ++
> >  34 files changed, 444 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v16si.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2df.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2di.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v2si.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4df.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4di.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v4si.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8df.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8di.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-v8si.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index d77ad83e437..c01809cc3da 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -22988,6 +22988,46 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
> >    return default_noce_conversion_profitable_p (seq, if_info);
> >  }
> >
> > +/* Return true if REF may have STF issue, otherwise false.
> > +   Any unaligned_load from parm_decl which is passed by stack
> > +   is considered to have STLF stall issue.  */
> > +static bool
> > +ix86_load_maybe_stfs_p (data_reference* dr)
> > +{
> > +  tree addr = DR_BASE_ADDRESS (dr);
> > +  if (TREE_CODE (addr) != ADDR_EXPR)
> > +    return false;
> > +  addr = get_base_address (TREE_OPERAND (addr, 0));
> > +
> > +  if (TREE_CODE (addr) != PARM_DECL)
> > +    return false;
> > +  tree type = TREE_TYPE (addr);
> > +  if (!type)
>
> type should never be NULL
Will change.
>
> > +    return false;
> > +
> > +  machine_mode mode = TYPE_MODE (type);
> > +
> > +  /* There could be false positive in determine parameter passed by stack.
> > +     .i.e. parameter can be put in registers but finally passed by stack
> > +     because registers are ran out.  */
> > +  if (TARGET_64BIT)
> > +    {
> > +      /* From function_arg_64.  */
> > +      enum x86_64_reg_class regclass[MAX_CLASSES];
> > +      int zero_width_bitfields = 0;
> > +      return !classify_argument (mode, type, regclass, 0, zero_width_bitfields);
> > +    }
> > +  else
> > +    {
> > +      /* From function_arg_32.  */
> > +      return (mode == E_BLKmode
> > +             || (AGGREGATE_TYPE_P (type)
> > +                 && (VECTOR_MODE_P (mode) || mode == TImode)));
> > +    }
> > +
> > +  return false;
>
> that stmt is unreachable.
Will change.
>
> > +}
> > +
> >  /* x86-specific vector costs.  */
> >  class ix86_vector_costs : public vector_costs
> >  {
> > @@ -23218,6 +23258,17 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >    if (stmt_cost == -1)
> >      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> >
> > +  /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue.
> > +     Performance may lose when there's no STF issue(1 vector_load vs n
> > +     scalar_load + CTOR).
> > +     TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine
>
> cost(2000) is no longer there
>
> > +     tuned.  */
> > +  if (kind == unaligned_load && stmt_info
> > +      && stmt_info->slp_type == pure_slp
>
> You want to restrict this to BB vectorization?  pure_slp isn't exactly that,
> instead you can do
>
>            && is_a <bb_vec_info> (m_vinfo)
>
> > +      && STMT_VINFO_DATA_REF (stmt_info)
> > +      && ix86_load_maybe_stfs_p (STMT_VINFO_DATA_REF (stmt_info)))
> > +    stmt_cost += COSTS_N_INSNS (ix86_cost->stfs / 2);
>
> I wonder why we divide stfs by two?
It just aligns with how the costs of vec_load/scalar_load/unaligned_load are calculated.
>
> I'd suggest an additional check, that the DR is close to function start.  One
> possible check that occurs to me is to check
>
>   STMT_VINFO_DR_INFO (stmt_info)->group == 0
>
> that will for example avoid the penalty for
>
> struct Y y;
> void foo (struct X x)
> {
>   bar();
>   y.a = x.a;
>   y.b = x.b;
> }
>
> but also (maybe not wanted) when the access happens after control
> flow transfer like with
>
> struct Y y;
> void foo (struct X x, int flag)
> {
>   if (flag)
>    {
>     y.a = x.a;
>     y.b = x.b;
>    }
> }
>
> I think we should be conservative with what we pessimize until we have
> evidence that we need to include more cases, also since this after-the-fact
> handling of the issue in costing is sub-optimal.  Ideally the vectorizer itself
> would decide to vectorize the load in a way to avoid STLF fails, but that's
> nothing we can easily arrange for at this stage.
>
> Another option could be to split such loads during md-reorg where we could
Let me try this.
> somehow "count" the latency from function entry, only scanning paths from
> there up to a point where the store buffer is likely not drained (with a
> different target cost parameter?) and only scanning BBs that are not
> optimize_for_size.  That
> might be a better place to do after-the-fact adjustments (the cost adjustment
> won't avoid the STLF fail if the rest of the vectorization compensates the
> penalty).
>
> Richard.
>
> > +
> >    /* Penalize DFmode vector operations for Bonnell.  */
> >    if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
> >        && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 0d28e57f8f2..341f1c47981 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -168,6 +168,7 @@ struct processor_costs {
> >                                    in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    const int sse_unaligned_load[5];/* cost of unaligned load.  */
> >    const int sse_unaligned_store[5];/* cost of unaligned store.  */
> > +  const int stfs;               /* cost of store forward stalls.  */
> >    const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
> >             zmm_move;
> >    const int sse_to_integer;    /* cost of moving SSE register to integer.  */
> > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > index 017ffa69958..3a5fcdeefdd 100644
> > --- a/gcc/config/i386/x86-tune-costs.h
> > +++ b/gcc/config/i386/x86-tune-costs.h
> > @@ -100,6 +100,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> >                                            in 128bit, 256bit and 512bit */
> >    {3, 3, 3, 3, 3},                     /* cost of unaligned SSE store
> >                                            in 128bit, 256bit and 512bit */
> > +  6,                                   /* cost of store forward stall.  */
> >    3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    5, 0,                                        /* Gather load static, per_elt.  */
> > @@ -209,6 +210,7 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> > +  8,                                   /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -317,6 +319,7 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> > +  8,                                   /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -427,6 +430,7 @@ struct processor_costs pentium_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> > +  8,                                   /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -528,6 +532,7 @@ struct processor_costs lakemont_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> > +  8,                                   /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -644,6 +649,7 @@ struct processor_costs pentiumpro_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
> >    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
> > +  24,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -751,6 +757,7 @@ struct processor_costs geode_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
> >    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
> > +  14,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    2, 2,                                        /* Gather load static, per_elt.  */
> > @@ -858,6 +865,7 @@ struct processor_costs k6_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
> >    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
> > +  24,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    2, 2,                                        /* Gather load static, per_elt.  */
> > @@ -971,6 +979,7 @@ struct processor_costs athlon_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 4, 12, 12, 24},                  /* cost of unaligned loads.  */
> >    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
> > +  14,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    5,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -1086,6 +1095,7 @@ struct processor_costs k8_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 3, 12, 12, 24},                  /* cost of unaligned loads.  */
> >    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
> > +  14,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    5,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -1214,6 +1224,7 @@ struct processor_costs amdfam10_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
> >    {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
> > +  21,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    3,                                   /* cost of moving SSE register to integer.  */
> >    4, 4,                                        /* Gather load static, per_elt.  */
> > @@ -1334,6 +1345,7 @@ const struct processor_costs bdver_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {12, 12, 10, 40, 60},                        /* cost of unaligned loads.  */
> >    {10, 10, 10, 40, 60},                        /* cost of unaligned stores.  */
> > +  54,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    16,                                  /* cost of moving SSE register to integer.  */
> >    12, 12,                              /* Gather load static, per_elt.  */
> > @@ -1475,6 +1487,7 @@ struct processor_costs znver1_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
> >    {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
> > +  42,                                  /* cost of store forward stall.  */
> >    2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > @@ -1630,6 +1643,7 @@ struct processor_costs znver2_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
> >    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> > +  42,                                  /* cost of store forward stall.  */
> >    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
> >                                            register.  */
> >    6,                                   /* cost of moving SSE register to integer.  */
> > @@ -1762,6 +1776,7 @@ struct processor_costs znver3_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
> >    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> > +  42,                                  /* cost of store forward stall.  */
> >    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
> >                                            register.  */
> >    6,                                   /* cost of moving SSE register to integer.  */
> > @@ -1907,6 +1922,7 @@ struct processor_costs skylake_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
> >    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> > +  26,                                  /* cost of store forward stall.  */
> >    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    20, 8,                               /* Gather load static, per_elt.  */
> > @@ -2033,6 +2049,7 @@ struct processor_costs icelake_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
> >    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> > +  26,                                  /* cost of store forward stall.  */
> >    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    20, 8,                               /* Gather load static, per_elt.  */
> > @@ -2153,6 +2170,7 @@ struct processor_costs alderlake_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> > +  90,                                  /* cost of store forward stall.  */
> >    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    18, 6,                               /* Gather load static, per_elt.  */
> > @@ -2266,6 +2284,7 @@ const struct processor_costs btver1_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
> >    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
> > +  36,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    14,                                  /* cost of moving SSE register to integer.  */
> >    10, 10,                              /* Gather load static, per_elt.  */
> > @@ -2376,6 +2395,7 @@ const struct processor_costs btver2_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
> >    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
> > +  36,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    14,                                  /* cost of moving SSE register to integer.  */
> >    10, 10,                              /* Gather load static, per_elt.  */
> > @@ -2485,6 +2505,7 @@ struct processor_costs pentium4_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {32, 32, 32, 64, 128},               /* cost of unaligned loads.  */
> >    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
> > +  10,                                  /* cost of store forward stall.  */
> >    12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
> >    20,                                  /* cost of moving SSE register to integer.  */
> >    16, 16,                              /* Gather load static, per_elt.  */
> > @@ -2597,6 +2618,7 @@ struct processor_costs nocona_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {24, 24, 24, 48, 96},                        /* cost of unaligned loads.  */
> >    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
> > +  8,                                   /* cost of store forward stall.  */
> >    6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
> >    20,                                  /* cost of moving SSE register to integer.  */
> >    12, 12,                              /* Gather load static, per_elt.  */
> > @@ -2707,6 +2729,7 @@ struct processor_costs atom_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
> >    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
> > +  32,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    8,                                   /* cost of moving SSE register to integer.  */
> >    8, 8,                                        /* Gather load static, per_elt.  */
> > @@ -2817,6 +2840,7 @@ struct processor_costs slm_cost = {
> >                                            in SImode, DImode and TImode.  */
> >    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
> >    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
> > +  48,                                  /* cost of store forward stall.  */
> >    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
> >    8,                                   /* cost of moving SSE register to integer.  */
> >    8, 8,                                        /* Gather load static, per_elt.  */
> > @@ -2939,6 +2963,7 @@ struct processor_costs tremont_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> > +  42,                                  /* cost of store forward stall.  */
> >    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    18, 6,                               /* Gather load static, per_elt.  */
> > @@ -3051,6 +3076,7 @@ struct processor_costs intel_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
> >    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
> > +  22,                                  /* cost of store forward stall.  */
> >    2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
> >    4,                                   /* cost of moving SSE register to integer.  */
> >    6, 6,                                        /* Gather load static, per_elt.  */
> > @@ -3168,6 +3194,7 @@ struct processor_costs generic_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
> >    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
> > +  54,                                  /* cost of store forward stall.  */
> >    2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    6,                                   /* cost of moving SSE register to integer.  */
> >    18, 6,                               /* Gather load static, per_elt.  */
> > @@ -3291,6 +3318,7 @@ struct processor_costs core_cost = {
> >                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
> >    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
> >    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
> > +  26,                                  /* cost of store forward stall.  */
> >    2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
> >    2,                                   /* cost of moving SSE register to integer.  */
> >    /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > new file mode 100644
> > index 00000000000..f8e0f2e26bb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X* x, struct X* y)
> > +{
> > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > new file mode 100644
> > index 00000000000..f4ff7a83c82
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> > +
> > +struct X { double x[4]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X x, struct X y)
> > +{
> > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> > new file mode 100644
> > index 00000000000..6f853aa7750
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
> > @@ -0,0 +1,90 @@
> > +/* PR target/101908.  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64 -O2 -mtune=generic -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not "add new stmt:.*MEM \<vector(2) double\>.*ray + 24B" "slp2" } }  */
> > +/* Check that the load from the by-value parameter RAY is not vectorized, since that could cause a store-to-load forwarding (STLF) stall.  */
> > +
> > +#define sqrt __builtin_sqrt
> > +#define SQ(x)          ((x) * (x))
> > +struct vec3 {
> > +  double x, y, z;
> > +};
> > +
> > +struct ray {
> > +  struct vec3 orig, dir;
> > +};
> > +
> > +struct material {
> > +  struct vec3 col;     /* color */
> > +  double spow;         /* specular power */
> > +  double refl;         /* reflection intensity */
> > +};
> > +
> > +struct sphere {
> > +  struct vec3 pos;
> > +  double rad;
> > +  struct material mat;
> > +  struct sphere *next;
> > +};
> > +
> > +struct spoint {
> > +  struct vec3 pos, normal, vref;       /* position, normal and view reflection */
> > +  double dist;         /* parametric distance of intersection along the ray */
> > +};
> > +
> > +#define ERR_MARGIN             1e-6
> > +
> > +#define DOT(a, b)      ((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
> > +#define NORMALIZE(a)  do {                     \
> > +    double len = sqrt(DOT(a, a));              \
> > +    (a).x /= len; (a).y /= len; (a).z /= len;  \
> > +  } while(0);
> > +
> > +static struct vec3
> > +reflect(struct vec3 v, struct vec3 n) {
> > +  struct vec3 res;
> > +  double dot = v.x * n.x + v.y * n.y + v.z * n.z;
> > +  res.x = -(2.0 * dot * n.x - v.x);
> > +  res.y = -(2.0 * dot * n.y - v.y);
> > +  res.z = -(2.0 * dot * n.z - v.z);
> > +  return res;
> > +}
> > +
> > +int ray_sphere(const struct sphere *sph,
> > +              struct ray ray, struct spoint *sp) {
> > +  double a, b, c, d, sqrt_d, t1, t2;
> > +
> > +  a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
> > +  b = 2.0 * ray.dir.x * (ray.orig.x - sph->pos.x) +
> > +    2.0 * ray.dir.y * (ray.orig.y - sph->pos.y) +
> > +    2.0 * ray.dir.z * (ray.orig.z - sph->pos.z);
> > +  c = SQ(sph->pos.x) + SQ(sph->pos.y) + SQ(sph->pos.z) +
> > +    SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
> > +    2.0 * (-sph->pos.x * ray.orig.x - sph->pos.y * ray.orig.y - sph->pos.z * ray.orig.z) - SQ(sph->rad);
> > +
> > +  if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
> > +
> > +  sqrt_d = sqrt(d);
> > +  t1 = (-b + sqrt_d) / (2.0 * a);
> > +  t2 = (-b - sqrt_d) / (2.0 * a);
> > +
> > +  if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
> > +
> > +  if(sp) {
> > +    if(t1 < ERR_MARGIN) t1 = t2;
> > +    if(t2 < ERR_MARGIN) t2 = t1;
> > +    sp->dist = t1 < t2 ? t1 : t2;
> > +
> > +    sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
> > +    sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
> > +    sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
> > +
> > +    sp->normal.x = (sp->pos.x - sph->pos.x) / sph->rad;
> > +    sp->normal.y = (sp->pos.y - sph->pos.y) / sph->rad;
> > +    sp->normal.z = (sp->pos.z - sph->pos.z) / sph->rad;
> > +
> > +    sp->vref = reflect(ray.dir, sp->normal);
> > +    NORMALIZE(sp->vref);
> > +  }
> > +  return 1;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
> > new file mode 100644
> > index 00000000000..fcd3ee8122f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16hi.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) short int\>} "slp2" } } */
> > +
> > +#define TYPE short
> > +#include "pr101908-v16qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
> > new file mode 100644
> > index 00000000000..6d43788600e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16qi.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3  -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) char\>} "slp2" } } */
> > +
> > +#ifndef TYPE
> > +#define TYPE char
> > +#endif
> > +
> > +struct X { TYPE a[128]; };
> > +
> > +void __attribute__((noipa))
> > +foo16 (struct X x, struct X y, TYPE* __restrict p)
> > +{
> > +  p[0] = x.a[1] + y.a[1];
> > +  p[1] = x.a[2] + y.a[2];
> > +  p[2] = x.a[3] + y.a[3];
> > +  p[3] = x.a[4] + y.a[4];
> > +  p[4] = x.a[5] + y.a[5];
> > +  p[5] = x.a[6] + y.a[6];
> > +  p[6] = x.a[7] + y.a[7];
> > +  p[7] = x.a[8] + y.a[8];
> > +  p[8] = x.a[9] + y.a[9];
> > +  p[9] = x.a[10] + y.a[10];
> > +  p[10] = x.a[11] + y.a[11];
> > +  p[11] = x.a[12] + y.a[12];
> > +  p[12] = x.a[13] + y.a[13];
> > +  p[13] = x.a[14] + y.a[14];
> > +  p[14] = x.a[15] + y.a[15];
> > +  p[15] = x.a[16] + y.a[16];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
> > new file mode 100644
> > index 00000000000..f95b85abbc6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16sf.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) float\>} "slp2" } } */
> > +
> > +#define TYPE float
> > +#include "pr101908-v16qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v16si.c b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
> > new file mode 100644
> > index 00000000000..5c48aa5da69
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v16si.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx512f -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(16\) int\>} "slp2" } } */
> > +
> > +#define TYPE int
> > +#include "pr101908-v16qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2df.c b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
> > new file mode 100644
> > index 00000000000..9d3f157718c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2df.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) double\>} "slp2" } } */
> > +
> > +#define TYPE double
> > +#include "pr101908-v2qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2di.c b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
> > new file mode 100644
> > index 00000000000..c7cf9a71f21
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2di.c
> > @@ -0,0 +1,7 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) long long int\>} "slp2" } } */
> > +
> > +typedef long long int64_t;
> > +#define TYPE int64_t
> > +#include "pr101908-v2qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
> > new file mode 100644
> > index 00000000000..e6024d70780
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2hi.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) short int\>} "slp2" } } */
> > +
> > +#define TYPE short
> > +#include "pr101908-v2qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
> > new file mode 100644
> > index 00000000000..cf876cc70d4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2qi.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) char\>} "slp2" } } */
> > +
> > +#ifndef TYPE
> > +#define TYPE char
> > +#endif
> > +
> > +struct X { TYPE a[128]; };
> > +
> > +void __attribute__((noipa))
> > +foo16 (struct X x, struct X y, TYPE* __restrict p)
> > +{
> > +  p[14] = x.a[15] + y.a[15];
> > +  p[15] = x.a[16] + y.a[16];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
> > new file mode 100644
> > index 00000000000..eb6349b957e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2sf.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) float\>} "slp2" } } */
> > +
> > +#define TYPE float
> > +#include "pr101908-v2qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v2si.c b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
> > new file mode 100644
> > index 00000000000..ae5fa0749c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v2si.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) int\>} "slp2" } } */
> > +
> > +#define TYPE int
> > +#include "pr101908-v2qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4df.c b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
> > new file mode 100644
> > index 00000000000..94497422704
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4df.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) double\>} "slp2" } } */
> > +
> > +#define TYPE double
> > +#include "pr101908-v4qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4di.c b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
> > new file mode 100644
> > index 00000000000..71407aa9fc7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4di.c
> > @@ -0,0 +1,7 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) long long int\>} "slp2" } } */
> > +
> > +typedef long long int64_t;
> > +#define TYPE int64_t
> > +#include "pr101908-v4qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
> > new file mode 100644
> > index 00000000000..4b207b91225
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4hi.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) short int\>} "slp2" } } */
> > +
> > +#define TYPE short
> > +#include "pr101908-v4qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
> > new file mode 100644
> > index 00000000000..5292d3442ec
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4qi.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) char\>} "slp2" } } */
> > +
> > +#ifndef TYPE
> > +#define TYPE char
> > +#endif
> > +
> > +struct X { TYPE a[128]; };
> > +
> > +void __attribute__((noipa))
> > +foo16 (struct X x, struct X y, TYPE* __restrict p)
> > +{
> > +  p[12] = x.a[13] + y.a[13];
> > +  p[13] = x.a[14] + y.a[14];
> > +  p[14] = x.a[15] + y.a[15];
> > +  p[15] = x.a[16] + y.a[16];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
> > new file mode 100644
> > index 00000000000..a2c6273120d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4sf.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) float\>} "slp2" } } */
> > +
> > +#define TYPE float
> > +#include "pr101908-v4qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v4si.c b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
> > new file mode 100644
> > index 00000000000..c6824285c74
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v4si.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(4\) int\>} "slp2" } } */
> > +
> > +#define TYPE int
> > +#include "pr101908-v4qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
> > new file mode 100644
> > index 00000000000..248c6d0fb91
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df-adl.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
> > +
> > +#define TYPE double
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8df.c b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
> > new file mode 100644
> > index 00000000000..05eb2dd51d0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8df.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) double\>} "slp2" } } */
> > +
> > +#define TYPE double
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
> > new file mode 100644
> > index 00000000000..b0055d7d2c0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di-adl.c
> > @@ -0,0 +1,7 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512f -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
> > +
> > +typedef long long int64_t;
> > +#define TYPE int64_t
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8di.c b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
> > new file mode 100644
> > index 00000000000..76a393bcc6c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8di.c
> > @@ -0,0 +1,7 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512f -mtune=generic -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) long long int\>} "slp2" } } */
> > +
> > +typedef long long int64_t;
> > +#define TYPE int64_t
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
> > new file mode 100644
> > index 00000000000..28977adae28
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi-adl.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
> > +
> > +#define TYPE short
> > +#include "pr101908-v8qi-adl.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
> > new file mode 100644
> > index 00000000000..89b50885366
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8hi.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) short int\>} "slp2" } } */
> > +
> > +#define TYPE short
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
> > new file mode 100644
> > index 00000000000..be668e5d006
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi-adl.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O3 -march=x86-64 -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
> > +
> > +#ifndef TYPE
> > +#define TYPE char
> > +#endif
> > +
> > +struct X { TYPE a[128]; };
> > +
> > +void __attribute__((noipa))
> > +foo16 (struct X x, struct X y, TYPE* __restrict p)
> > +{
> > +  p[8] = x.a[9] + y.a[9];
> > +  p[9] = x.a[10] + y.a[10];
> > +  p[10] = x.a[11] + y.a[11];
> > +  p[11] = x.a[12] + y.a[12];
> > +  p[12] = x.a[13] + y.a[13];
> > +  p[13] = x.a[14] + y.a[14];
> > +  p[14] = x.a[15] + y.a[15];
> > +  p[15] = x.a[16] + y.a[16];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
> > new file mode 100644
> > index 00000000000..842c88c8952
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8qi.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O3 -march=x86-64 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) char\>} "slp2" } } */
> > +
> > +#ifndef TYPE
> > +#define TYPE char
> > +#endif
> > +
> > +struct X { TYPE a[128]; };
> > +
> > +void __attribute__((noipa))
> > +foo16 (struct X x, struct X y, TYPE* __restrict p)
> > +{
> > +  p[8] = x.a[9] + y.a[9];
> > +  p[9] = x.a[10] + y.a[10];
> > +  p[10] = x.a[11] + y.a[11];
> > +  p[11] = x.a[12] + y.a[12];
> > +  p[12] = x.a[13] + y.a[13];
> > +  p[13] = x.a[14] + y.a[14];
> > +  p[14] = x.a[15] + y.a[15];
> > +  p[15] = x.a[16] + y.a[16];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
> > new file mode 100644
> > index 00000000000..89d33566a40
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf-adl.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
> > +
> > +#define TYPE float
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
> > new file mode 100644
> > index 00000000000..81557c7b9b7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8sf.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) float\>} "slp2" } } */
> > +
> > +#define TYPE float
> > +#include "pr101908-v8qi.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
> > new file mode 100644
> > index 00000000000..883956a0d49
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si-adl.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -mtune=alderlake -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
> > +
> > +#define TYPE int
> > +#include "pr101908-v8qi-adl.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-v8si.c b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
> > new file mode 100644
> > index 00000000000..142f46012d7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-v8si.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=x86-64 -mavx2 -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump {(?n)add new stmt: vect.*MEM \<vector\(8\) int\>} "slp2" } } */
> > +
> > +#define TYPE int
> > +#include "pr101908-v8qi.c"
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-03-17  7:12 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-04  7:27 [PATCH] [i386] Prevent vectorization for load from parm_decl at O2 to avoid STF issue liuhongt
2022-03-07  8:26 ` Hongtao Liu
2022-03-07  9:37 ` Richard Biener
2022-03-08  3:39   ` Hongtao Liu
2022-03-15  9:13     ` [PATCH] [i386] Add extra cost for unsigned_load which may have stall forward issue liuhongt
2022-03-16  1:03     ` liuhongt
2022-03-16  2:19     ` liuhongt
2022-03-16  9:54       ` Richard Biener
2022-03-17  7:12         ` Hongtao Liu

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as for URLs of read-only IMAP folder(s) and NNTP newsgroup(s).