* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
[not found] <20210726084723.49443-1-hongtao.liu@intel.com>
@ 2021-07-26 8:49 ` Hongtao Liu
2021-07-27 1:54 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2021-07-26 8:49 UTC (permalink / raw)
To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches
Corrected the mailing list; please reply to this email.
On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> Hi:
> As described in the PR, the pinsr instruction has poor throughput on SKX
> and CLX, which leads to worse vectorization performance in some cases.
> This patch adds a cost member named integer_to_sse to model pinsr/movd,
> which are used for vector construction; the cost is the same as sse_op on
> other targets, but twice as much as sse_op on CLX/SKX.
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/99881
> * config/i386/i386.h (processor_costs): Add new member
> integer_to_sse.
> * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> generic_cost, core_cost): Initialize integer_to_sse same value
> as sse_op.
> (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> * config/i386/i386.c (ix86_builtin_vectorization_cost):
> Use integer_to_sse instead of sse_op to calculate the cost of
> vec_construct.
>
> gcc/testsuite/ChangeLog:
>
> PR target/99881
> * gcc.target/i386/pr99881.c: New test.
> ---
> gcc/config/i386/i386.c | 6 ++-
> gcc/config/i386/i386.h | 1 +
> gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> 4 files changed, 81 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ff96134fb37..fbebd2d8f9a 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> case vec_construct:
> {
> /* N element inserts into SSE vectors. */
> - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> + int cost
> + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> + ix86_cost->sse_op
> + : ix86_cost->integer_to_sse);
> +
> /* One vinserti128 for combining two SSE vectors for AVX256. */
> if (GET_MODE_BITSIZE (mode) == 256)
> cost += ix86_vec_cost (mode, ix86_cost->addss);
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 0c2c93daf32..d1e1c225990 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -165,6 +165,7 @@ struct processor_costs {
> const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> zmm_move;
> const int sse_to_integer; /* cost of moving SSE register to integer. */
> + const int integer_to_sse; /* cost of moving integer to SSE register. */
> const int gather_static, gather_per_elt; /* Cost of gather load is computed
> as static + per_item * nelts. */
> const int scatter_static, scatter_per_elt; /* Cost of gather store is
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..67cfa006196 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> in 128bit, 256bit and 512bit */
> 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> 5, 0, /* Gather load static, per_elt. */
> 5, 0, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 4, /* size of l1 cache. 486 has 8kB cache
> @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 2, 2, /* Gather load static, per_elt. */
> 2, 2, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 2, 2, /* Gather load static, per_elt. */
> 2, 2, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 5, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 5, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 3, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 16, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 12, 12, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> throughput 12. Approx 9 uops do not depend on vector size and every load
> is 7 uops. */
> @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> register. */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> throughput 12. Approx 9 uops do not depend on vector size and every load
> is 7 uops. */
> @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> register. */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> throughput 9. Approx 7 uops do not depend on vector size and every load
> is 4 uops. */
> @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> 20, 8, /* Gather load static, per_elt. */
> 22, 10, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 20, 8, /* Gather load static, per_elt. */
> 22, 10, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 14, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 10, 10, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 14, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 10, 10, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> 20, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 16, 16, /* Gather load static, per_elt. */
> 16, 16, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> 20, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> 12, 12, /* Gather load static, per_elt. */
> 12, 12, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 8, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 8, 8, /* Gather load static, per_elt. */
> 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> 8, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 8, 8, /* Gather load static, per_elt. */
> 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> 4, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 6, 6, /* Gather load static, per_elt. */
> 6, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> 6, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> 18, 6, /* Gather load static, per_elt. */
> 18, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> 2, /* cost of moving SSE register to integer. */
> + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> rec. throughput 6.
> So 5 uops statically and one uops per load. */
> diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> new file mode 100644
> index 00000000000..7ae51c8310d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> @@ -0,0 +1,49 @@
> +/* PR target/99881. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=skylake" } */
> +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> +
> +void
> +foo (int* __restrict a, int n, int c)
> +{
> + a[0] = n;
> + a[1] = c;
> +}
> +
> +void
> +foo1 (int* __restrict a, int n, int b, int c, int d)
> +{
> + a[0] = n;
> + a[1] = b;
> + a[2] = c;
> + a[3] = d;
> +}
> +
> +void
> +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> +{
> + a[0] = n;
> + a[1] = b;
> + a[2] = c;
> + a[3] = d;
> + a[4] = e;
> + a[5] = f;
> + a[6] = g;
> + a[7] = h;
> +}
> +
> +void
> +foo3 (long long* __restrict a, long long n, long long c)
> +{
> + a[0] = n;
> + a[1] = c;
> +}
> +
> +void
> +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> +{
> + a[0] = n;
> + a[1] = b;
> + a[2] = c;
> + a[3] = d;
> +}
> --
> 2.18.1
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-07-26 8:49 ` [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct Hongtao Liu
@ 2021-07-27 1:54 ` Hongtao Liu
2021-07-28 2:54 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2021-07-27 1:54 UTC (permalink / raw)
To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches
On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Corrected the mailing list; please reply to this email.
>
> On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Hi:
> > As described in the PR, the pinsr instruction has poor throughput on SKX
> > and CLX, which leads to worse vectorization performance in some cases.
> > This patch adds a cost member named integer_to_sse to model pinsr/movd,
> > which are used for vector construction; the cost is the same as sse_op on
> > other targets, but twice as much as sse_op on CLX/SKX.
> > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > Ok for trunk?
> >
I'm going to check in this patch if there's no objection.
> > gcc/ChangeLog:
> >
> > PR target/99881
> > * config/i386/i386.h (processor_costs): Add new member
> > integer_to_sse.
> > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > generic_cost, core_cost): Initialize integer_to_sse same value
> > as sse_op.
> > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > Use integer_to_sse instead of sse_op to calculate the cost of
> > vec_construct.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/99881
> > * gcc.target/i386/pr99881.c: New test.
> > ---
> > gcc/config/i386/i386.c | 6 ++-
> > gcc/config/i386/i386.h | 1 +
> > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > 4 files changed, 81 insertions(+), 1 deletion(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index ff96134fb37..fbebd2d8f9a 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > case vec_construct:
> > {
> > /* N element inserts into SSE vectors. */
> > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > + int cost
> > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > + ix86_cost->sse_op
> > + : ix86_cost->integer_to_sse);
> > +
> > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > if (GET_MODE_BITSIZE (mode) == 256)
> > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 0c2c93daf32..d1e1c225990 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -165,6 +165,7 @@ struct processor_costs {
> > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > zmm_move;
> > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > as static + per_item * nelts. */
> > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > index ffe810f2bcb..67cfa006196 100644
> > --- a/gcc/config/i386/x86-tune-costs.h
> > +++ b/gcc/config/i386/x86-tune-costs.h
> > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > in 128bit, 256bit and 512bit */
> > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > 5, 0, /* Gather load static, per_elt. */
> > 5, 0, /* Gather store static, per_elt. */
> > 0, /* size of l1 cache */
> > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 0, /* size of l1 cache */
> > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 4, /* size of l1 cache. 486 has 8kB cache
> > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 8, /* size of l1 cache. */
> > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 8, /* size of l1 cache. */
> > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 8, /* size of l1 cache. */
> > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 2, 2, /* Gather load static, per_elt. */
> > 2, 2, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 2, 2, /* Gather load static, per_elt. */
> > 2, 2, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 5, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 5, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 3, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 4, 4, /* Gather load static, per_elt. */
> > 4, 4, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 16, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 12, 12, /* Gather load static, per_elt. */
> > 10, 10, /* Gather store static, per_elt. */
> > 16, /* size of l1 cache. */
> > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > throughput 12. Approx 9 uops do not depend on vector size and every load
> > is 7 uops. */
> > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > register. */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > throughput 12. Approx 9 uops do not depend on vector size and every load
> > is 7 uops. */
> > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > register. */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > throughput 9. Approx 7 uops do not depend on vector size and every load
> > is 4 uops. */
> > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > 20, 8, /* Gather load static, per_elt. */
> > 22, 10, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 20, 8, /* Gather load static, per_elt. */
> > 22, 10, /* Gather store static, per_elt. */
> > 64, /* size of l1 cache. */
> > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 14, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 10, 10, /* Gather load static, per_elt. */
> > 10, 10, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 14, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 10, 10, /* Gather load static, per_elt. */
> > 10, 10, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > 20, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 16, 16, /* Gather load static, per_elt. */
> > 16, 16, /* Gather store static, per_elt. */
> > 8, /* size of l1 cache. */
> > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > 20, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > 12, 12, /* Gather load static, per_elt. */
> > 12, 12, /* Gather store static, per_elt. */
> > 8, /* size of l1 cache. */
> > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 8, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 8, 8, /* Gather load static, per_elt. */
> > 8, 8, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > 8, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 8, 8, /* Gather load static, per_elt. */
> > 8, 8, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > 4, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 6, 6, /* Gather load static, per_elt. */
> > 6, 6, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > 6, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > 18, 6, /* Gather load static, per_elt. */
> > 18, 6, /* Gather store static, per_elt. */
> > 32, /* size of l1 cache. */
> > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > 2, /* cost of moving SSE register to integer. */
> > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > rec. throughput 6.
> > So 5 uops statically and one uops per load. */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > new file mode 100644
> > index 00000000000..7ae51c8310d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > @@ -0,0 +1,49 @@
> > +/* PR target/99881. */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -march=skylake" } */
> > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > +
> > +void
> > +foo (int* __restrict a, int n, int c)
> > +{
> > + a[0] = n;
> > + a[1] = c;
> > +}
> > +
> > +void
> > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > +{
> > + a[0] = n;
> > + a[1] = b;
> > + a[2] = c;
> > + a[3] = d;
> > +}
> > +
> > +void
> > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > +{
> > + a[0] = n;
> > + a[1] = b;
> > + a[2] = c;
> > + a[3] = d;
> > + a[4] = e;
> > + a[5] = f;
> > + a[6] = g;
> > + a[7] = h;
> > +}
> > +
> > +void
> > +foo3 (long long* __restrict a, long long n, long long c)
> > +{
> > + a[0] = n;
> > + a[1] = c;
> > +}
> > +
> > +void
> > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > +{
> > + a[0] = n;
> > + a[1] = b;
> > + a[2] = c;
> > + a[3] = d;
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-07-27 1:54 ` Hongtao Liu
@ 2021-07-28 2:54 ` Hongtao Liu
2021-08-03 9:20 ` Richard Biener
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2021-07-28 2:54 UTC (permalink / raw)
To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches
On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Correct mail list, please reply under this email.
> >
> > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Hi:
> > > > As described in the PR, the pinsr instruction has poor throughput on SKX
> > > > and CLX, which leads to worse performance in vectorization in some cases.
> > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd,
> > > > which is used by vector construction; the cost is the same as sse_op on other
> > > > targets, but twice as much as sse_op on CLX/SKX.
> > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> I'm going to check in this patch if there's no objection.
Pushed to trunk.
> > > gcc/ChangeLog:
> > >
> > > PR target/99881
> > > * config/i386/i386.h (processor_costs): Add new member
> > > integer_to_sse.
> > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > as sse_op.
> > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > vec_construct.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR target/99881
> > > * gcc.target/i386/pr99881.c: New test.
> > > ---
> > > gcc/config/i386/i386.c | 6 ++-
> > > gcc/config/i386/i386.h | 1 +
> > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > > 4 files changed, 81 insertions(+), 1 deletion(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > >
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index ff96134fb37..fbebd2d8f9a 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > > case vec_construct:
> > > {
> > > /* N element inserts into SSE vectors. */
> > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > + int cost
> > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > + ix86_cost->sse_op
> > > + : ix86_cost->integer_to_sse);
> > > +
> > > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > > if (GET_MODE_BITSIZE (mode) == 256)
> > > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index 0c2c93daf32..d1e1c225990 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -165,6 +165,7 @@ struct processor_costs {
> > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > > zmm_move;
> > > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > > as static + per_item * nelts. */
> > > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > index ffe810f2bcb..67cfa006196 100644
> > > --- a/gcc/config/i386/x86-tune-costs.h
> > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > > in 128bit, 256bit and 512bit */
> > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > > 5, 0, /* Gather load static, per_elt. */
> > > 5, 0, /* Gather store static, per_elt. */
> > > 0, /* size of l1 cache */
> > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 0, /* size of l1 cache */
> > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 4, /* size of l1 cache. 486 has 8kB cache
> > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 8, /* size of l1 cache. */
> > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 8, /* size of l1 cache. */
> > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 8, /* size of l1 cache. */
> > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 2, 2, /* Gather load static, per_elt. */
> > > 2, 2, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 2, 2, /* Gather load static, per_elt. */
> > > 2, 2, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 5, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 5, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 3, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 4, 4, /* Gather load static, per_elt. */
> > > 4, 4, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 16, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 12, 12, /* Gather load static, per_elt. */
> > > 10, 10, /* Gather store static, per_elt. */
> > > 16, /* size of l1 cache. */
> > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > is 7 uops. */
> > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > register. */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > is 7 uops. */
> > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > register. */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > > throughput 9. Approx 7 uops do not depend on vector size and every load
> > > is 4 uops. */
> > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > > 20, 8, /* Gather load static, per_elt. */
> > > 22, 10, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 20, 8, /* Gather load static, per_elt. */
> > > 22, 10, /* Gather store static, per_elt. */
> > > 64, /* size of l1 cache. */
> > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 14, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 10, 10, /* Gather load static, per_elt. */
> > > 10, 10, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 14, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 10, 10, /* Gather load static, per_elt. */
> > > 10, 10, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > > 20, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 16, 16, /* Gather load static, per_elt. */
> > > 16, 16, /* Gather store static, per_elt. */
> > > 8, /* size of l1 cache. */
> > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > > 20, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > 12, 12, /* Gather load static, per_elt. */
> > > 12, 12, /* Gather store static, per_elt. */
> > > 8, /* size of l1 cache. */
> > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 8, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 8, 8, /* Gather load static, per_elt. */
> > > 8, 8, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > 8, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 8, 8, /* Gather load static, per_elt. */
> > > 8, 8, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > > 4, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 6, 6, /* Gather load static, per_elt. */
> > > 6, 6, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > > 6, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > 18, 6, /* Gather load static, per_elt. */
> > > 18, 6, /* Gather store static, per_elt. */
> > > 32, /* size of l1 cache. */
> > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > 2, /* cost of moving SSE register to integer. */
> > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > > rec. throughput 6.
> > > So 5 uops statically and one uops per load. */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > new file mode 100644
> > > index 00000000000..7ae51c8310d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > @@ -0,0 +1,49 @@
> > > +/* PR target/99881. */
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-Ofast -march=skylake" } */
> > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > > +
> > > +void
> > > +foo (int* __restrict a, int n, int c)
> > > +{
> > > + a[0] = n;
> > > + a[1] = c;
> > > +}
> > > +
> > > +void
> > > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > > +{
> > > + a[0] = n;
> > > + a[1] = b;
> > > + a[2] = c;
> > > + a[3] = d;
> > > +}
> > > +
> > > +void
> > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > > +{
> > > + a[0] = n;
> > > + a[1] = b;
> > > + a[2] = c;
> > > + a[3] = d;
> > > + a[4] = e;
> > > + a[5] = f;
> > > + a[6] = g;
> > > + a[7] = h;
> > > +}
> > > +
> > > +void
> > > +foo3 (long long* __restrict a, long long n, long long c)
> > > +{
> > > + a[0] = n;
> > > + a[1] = c;
> > > +}
> > > +
> > > +void
> > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > > +{
> > > + a[0] = n;
> > > + a[1] = b;
> > > + a[2] = c;
> > > + a[3] = d;
> > > +}
> > > --
> > > 2.18.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-07-28 2:54 ` Hongtao Liu
@ 2021-08-03 9:20 ` Richard Biener
2021-08-03 10:20 ` Richard Biener
0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-08-03 9:20 UTC (permalink / raw)
To: Hongtao Liu; +Cc: liuhongt, GCC Patches
On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > Correct mail list, please reply under this email.
> > >
> > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > Hi:
> > > > > As described in the PR, the pinsr instruction has poor throughput on SKX
> > > > > and CLX, which leads to worse performance in vectorization in some cases.
> > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd,
> > > > > which is used by vector construction; the cost is the same as sse_op on other
> > > > > targets, but twice as much as sse_op on CLX/SKX.
> > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > Ok for trunk?
> > > >
> > I'm going to check in this patch if there's no objection.
> Pushed to trunk.
/* N element inserts into SSE vectors. */
- int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
+ int cost
+ = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
+ ix86_cost->sse_op
+ : ix86_cost->integer_to_sse);
+
so that's costing movd and pinsr the same, shouldn't we try to separate this
by doing
/* N element inserts into SSE vectors. */
int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
/* Account for int->SSE reg moves. */
if (!fp)
cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;
? pinsr is only supported with SSE4+ IIRC. Note we also have
case vec_to_scalar:
case scalar_to_vec:
return ix86_vec_cost (mode, ix86_cost->sse_op);
where scalar_to_vec is used to cost splats and vec_to_scalar is used
to cost element extracts. Both lack costing of the move part.
I realize we have GPR to XMM inserts which cover both the "move" and
the insert but then calling this 'integer_to_sse' is a bit odd. The extract
cost also depends on the element number for AVX2/AVX512F. The
vectorizer usually decomposes a vector fully and never does single
element extracts so the vextract128 cost amortizes.
That said, the change leaves all targets besides skylake_cost with
not so great defaults I think. For skylake you effectively add another
sse_op for the int->SSE move plus '1' (for whatever reason). I think
that's reasonable for all targets.
It does look a bit odd to have
8, /* cost of moving SSE register to integer. */
COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
where sse_to_integer is used by the STV pass which mixes
COSTS_N_INSNS scaled costs and unscaled costs (ick).
Richard.
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/99881
> > > > * config/i386/i386.h (processor_costs): Add new member
> > > > integer_to_sse.
> > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > as sse_op.
> > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > vec_construct.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/99881
> > > > * gcc.target/i386/pr99881.c: New test.
> > > > ---
> > > > gcc/config/i386/i386.c | 6 ++-
> > > > gcc/config/i386/i386.h | 1 +
> > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > > > 4 files changed, 81 insertions(+), 1 deletion(-)
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index ff96134fb37..fbebd2d8f9a 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > > > case vec_construct:
> > > > {
> > > > /* N element inserts into SSE vectors. */
> > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > > + int cost
> > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > > + ix86_cost->sse_op
> > > > + : ix86_cost->integer_to_sse);
> > > > +
> > > > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > > > if (GET_MODE_BITSIZE (mode) == 256)
> > > > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > index 0c2c93daf32..d1e1c225990 100644
> > > > --- a/gcc/config/i386/i386.h
> > > > +++ b/gcc/config/i386/i386.h
> > > > @@ -165,6 +165,7 @@ struct processor_costs {
> > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > > > zmm_move;
> > > > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > > > as static + per_item * nelts. */
> > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > index ffe810f2bcb..67cfa006196 100644
> > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > > > in 128bit, 256bit and 512bit */
> > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > > > 5, 0, /* Gather load static, per_elt. */
> > > > 5, 0, /* Gather store static, per_elt. */
> > > > 0, /* size of l1 cache */
> > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 0, /* size of l1 cache */
> > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 4, /* size of l1 cache. 486 has 8kB cache
> > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 8, /* size of l1 cache. */
> > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 8, /* size of l1 cache. */
> > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 8, /* size of l1 cache. */
> > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 2, 2, /* Gather load static, per_elt. */
> > > > 2, 2, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 2, 2, /* Gather load static, per_elt. */
> > > > 2, 2, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 5, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 5, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 3, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 4, 4, /* Gather load static, per_elt. */
> > > > 4, 4, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 16, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 12, 12, /* Gather load static, per_elt. */
> > > > 10, 10, /* Gather store static, per_elt. */
> > > > 16, /* size of l1 cache. */
> > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > is 7 uops. */
> > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > register. */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > is 7 uops. */
> > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > register. */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > > > throughput 9. Approx 7 uops do not depend on vector size and every load
> > > > is 4 uops. */
> > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > > > 20, 8, /* Gather load static, per_elt. */
> > > > 22, 10, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 20, 8, /* Gather load static, per_elt. */
> > > > 22, 10, /* Gather store static, per_elt. */
> > > > 64, /* size of l1 cache. */
> > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 14, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 10, 10, /* Gather load static, per_elt. */
> > > > 10, 10, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 14, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 10, 10, /* Gather load static, per_elt. */
> > > > 10, 10, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > > > 20, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 16, 16, /* Gather load static, per_elt. */
> > > > 16, 16, /* Gather store static, per_elt. */
> > > > 8, /* size of l1 cache. */
> > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > > > 20, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > 12, 12, /* Gather load static, per_elt. */
> > > > 12, 12, /* Gather store static, per_elt. */
> > > > 8, /* size of l1 cache. */
> > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 8, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 8, 8, /* Gather load static, per_elt. */
> > > > 8, 8, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > 8, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 8, 8, /* Gather load static, per_elt. */
> > > > 8, 8, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > > > 4, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 6, 6, /* Gather load static, per_elt. */
> > > > 6, 6, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > 6, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > 18, 6, /* Gather load static, per_elt. */
> > > > 18, 6, /* Gather store static, per_elt. */
> > > > 32, /* size of l1 cache. */
> > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > 2, /* cost of moving SSE register to integer. */
> > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > > > rec. throughput 6.
> > > > So 5 uops statically and one uops per load. */
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > new file mode 100644
> > > > index 00000000000..7ae51c8310d
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > @@ -0,0 +1,49 @@
> > > > +/* PR target/99881. */
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-Ofast -march=skylake" } */
> > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > > > +
> > > > +void
> > > > +foo (int* __restrict a, int n, int c)
> > > > +{
> > > > + a[0] = n;
> > > > + a[1] = c;
> > > > +}
> > > > +
> > > > +void
> > > > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > > > +{
> > > > + a[0] = n;
> > > > + a[1] = b;
> > > > + a[2] = c;
> > > > + a[3] = d;
> > > > +}
> > > > +
> > > > +void
> > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > > > +{
> > > > + a[0] = n;
> > > > + a[1] = b;
> > > > + a[2] = c;
> > > > + a[3] = d;
> > > > + a[4] = e;
> > > > + a[5] = f;
> > > > + a[6] = g;
> > > > + a[7] = h;
> > > > +}
> > > > +
> > > > +void
> > > > +foo3 (long long* __restrict a, long long n, long long c)
> > > > +{
> > > > + a[0] = n;
> > > > + a[1] = c;
> > > > +}
> > > > +
> > > > +void
> > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > > > +{
> > > > + a[0] = n;
> > > > + a[1] = b;
> > > > + a[2] = c;
> > > > + a[3] = d;
> > > > +}
> > > > --
> > > > 2.18.1
> > > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-08-03 9:20 ` Richard Biener
@ 2021-08-03 10:20 ` Richard Biener
2021-08-03 11:12 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-08-03 10:20 UTC (permalink / raw)
To: Hongtao Liu; +Cc: liuhongt, GCC Patches
On Tue, Aug 3, 2021 at 11:20 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > Correct mail list, please reply under this email.
> > > >
> > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > Hi:
> > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX
> > > > > and CLX, which leads to worse performance in vectorization in some cases.
> > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd
> > > > > which is used by vector construction, the cost is same as sse_op on other
> > > > > targets, but twice much as sse_op on CLX/SKX.
> > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > > Ok for trunk?
> > > > >
> > > I'm going to check in this patch if there's no objection.
> > Pushed to trunk.
>
> /* N element inserts into SSE vectors. */
> - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> + int cost
> + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> + ix86_cost->sse_op
> + : ix86_cost->integer_to_sse);
> +
>
> so that's costing movd and pinsr the same, shouldn't we try to separate this
> by doing
>
> /* N element inserts into SSE vectors. */
> int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> /* Account for int->SSE reg moves. */
> if (!fp)
> cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;
>
> ? pinsr is only supported with SSE4+ IIRC. Note we also have
>
> case vec_to_scalar:
> case scalar_to_vec:
> return ix86_vec_cost (mode, ix86_cost->sse_op);
>
> where scalar_to_vec is used to cost splats and vec_to_scalar is used
> to cost element extracts. Both lack costing of the move part.
>
> I realize we have GPR to XMM inserts which cover both the "move" and
> the insert but then calling this 'integer_to_sse' is a bit odd. The extract
> cost also depends on the element number for AVX2/AVX512F. The
> vectorizer usually decomposes a vector fully and never does single
> element extracts so the vextract128 cost amortizes.
>
> That said, the change leaves all targets besides skylake_cost with
> not so great defaults I think. For skylake you effectively add another
> sse_op for the int->SSE move plus '1' (for whatever reason). I think
> that's reasonable for all targets.
>
> It does look a bit odd to have
>
> 8, /* cost of moving SSE register to integer. */
> COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
>
> where sse_to_integer is used by the STV pass which mixes
> COSTS_N_INSNS scaled costs and unscaled costs (ick).
While debugging a possibly related issue, I applied the same costing
that you used for skylake_cost to znver2_cost and found that
538.imagick_r regresses by 23% at -Ofast -march=znver2 with that
change. So it seems changes here do indeed need careful benchmarking.
Richard.
> Richard.
>
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR target/99881
> > > > > * config/i386/i386.h (processor_costs): Add new member
> > > > > integer_to_sse.
> > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > > as sse_op.
> > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > > vec_construct.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > PR target/99881
> > > > > * gcc.target/i386/pr99881.c: New test.
> > > > > ---
> > > > > gcc/config/i386/i386.c | 6 ++-
> > > > > gcc/config/i386/i386.h | 1 +
> > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > > > > 4 files changed, 81 insertions(+), 1 deletion(-)
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > index ff96134fb37..fbebd2d8f9a 100644
> > > > > --- a/gcc/config/i386/i386.c
> > > > > +++ b/gcc/config/i386/i386.c
> > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > > > > case vec_construct:
> > > > > {
> > > > > /* N element inserts into SSE vectors. */
> > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > > > + int cost
> > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > > > + ix86_cost->sse_op
> > > > > + : ix86_cost->integer_to_sse);
> > > > > +
> > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > > > > if (GET_MODE_BITSIZE (mode) == 256)
> > > > > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > > index 0c2c93daf32..d1e1c225990 100644
> > > > > --- a/gcc/config/i386/i386.h
> > > > > +++ b/gcc/config/i386/i386.h
> > > > > @@ -165,6 +165,7 @@ struct processor_costs {
> > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > > > > zmm_move;
> > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > > > > as static + per_item * nelts. */
> > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > > index ffe810f2bcb..67cfa006196 100644
> > > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > > > > in 128bit, 256bit and 512bit */
> > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > > > > 5, 0, /* Gather load static, per_elt. */
> > > > > 5, 0, /* Gather store static, per_elt. */
> > > > > 0, /* size of l1 cache */
> > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 0, /* size of l1 cache */
> > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 4, /* size of l1 cache. 486 has 8kB cache
> > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 8, /* size of l1 cache. */
> > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 8, /* size of l1 cache. */
> > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 8, /* size of l1 cache. */
> > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 5, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 5, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 3, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 16, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > 16, /* size of l1 cache. */
> > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > is 7 uops. */
> > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > register. */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > is 7 uops. */
> > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > register. */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > > > > throughput 9. Approx 7 uops do not depend on vector size and every load
> > > > > is 4 uops. */
> > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > 64, /* size of l1 cache. */
> > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 14, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 14, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > > > > 20, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 16, 16, /* Gather load static, per_elt. */
> > > > > 16, 16, /* Gather store static, per_elt. */
> > > > > 8, /* size of l1 cache. */
> > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > > > > 20, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > 12, 12, /* Gather store static, per_elt. */
> > > > > 8, /* size of l1 cache. */
> > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 8, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > 8, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > > > > 4, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 6, 6, /* Gather load static, per_elt. */
> > > > > 6, 6, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > 6, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > 18, 6, /* Gather load static, per_elt. */
> > > > > 18, 6, /* Gather store static, per_elt. */
> > > > > 32, /* size of l1 cache. */
> > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > 2, /* cost of moving SSE register to integer. */
> > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > > > > rec. throughput 6.
> > > > > So 5 uops statically and one uops per load. */
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > new file mode 100644
> > > > > index 00000000000..7ae51c8310d
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > @@ -0,0 +1,49 @@
> > > > > +/* PR target/99881. */
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-Ofast -march=skylake" } */
> > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > > > > +
> > > > > +void
> > > > > +foo (int* __restrict a, int n, int c)
> > > > > +{
> > > > > + a[0] = n;
> > > > > + a[1] = c;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > > > > +{
> > > > > + a[0] = n;
> > > > > + a[1] = b;
> > > > > + a[2] = c;
> > > > > + a[3] = d;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > > > > +{
> > > > > + a[0] = n;
> > > > > + a[1] = b;
> > > > > + a[2] = c;
> > > > > + a[3] = d;
> > > > > + a[4] = e;
> > > > > + a[5] = f;
> > > > > + a[6] = g;
> > > > > + a[7] = h;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +foo3 (long long* __restrict a, long long n, long long c)
> > > > > +{
> > > > > + a[0] = n;
> > > > > + a[1] = c;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > > > > +{
> > > > > + a[0] = n;
> > > > > + a[1] = b;
> > > > > + a[2] = c;
> > > > > + a[3] = d;
> > > > > +}
> > > > > --
> > > > > 2.18.1
> > > > >
> > > >
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-08-03 10:20 ` Richard Biener
@ 2021-08-03 11:12 ` Hongtao Liu
2021-08-09 2:52 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2021-08-03 11:12 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
On Tue, Aug 3, 2021 at 6:20 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Tue, Aug 3, 2021 at 11:20 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > Correct mail list, please reply under this email.
> > > > >
> > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > >
> > > > > > Hi:
> > > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX
> > > > > > and CLX, which leads to worse performance in vectorization in some cases.
> > > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd
> > > > > > which is used by vector construction, the cost is same as sse_op on other
> > > > > > targets, but twice much as sse_op on CLX/SKX.
> > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > > > Ok for trunk?
> > > > > >
> > > > I'm going to check in this patch if there's no objection.
> > > Pushed to trunk.
> >
> > /* N element inserts into SSE vectors. */
> > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > + int cost
> > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > + ix86_cost->sse_op
> > + : ix86_cost->integer_to_sse);
> > +
> >
> > so that's costing movd and pinsr the same, shouldn't we try to separate this
> > by doing
> >
> > /* N element inserts into SSE vectors. */
> > int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > /* Account for int->SSE reg moves. */
> > if (!fp)
> > cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;
> >
> > ? pinsr is only supported with SSE4+ IIRC. Note we also have
We have pinsrw under sse2, and pinsrb/d/q w/ sse4+.
integer_to_sse is an estimate that models the average overhead of moving
each integer from gpr to sse; it can be movd + unpck, or movd + pinsr.
It seems reasonable to have uniform costs for scalar_to_vec and integer_to_sse.
vec_to_scalar and sse_to_integer seem to be different:
sse_to_integer corresponds to movd, while vec_to_scalar is vec_extract.
Maybe we should rename integer_to_sse to vec_set_integer.
> >
> > case vec_to_scalar:
> > case scalar_to_vec:
> > return ix86_vec_cost (mode, ix86_cost->sse_op);
> >
> > where scalar_to_vec is used to cost splats and vec_to_scalar is used
> > to cost element extracts. Both lack costing of the move part.
> >
> > I realize we have GPR to XMM inserts which cover both the "move" and
> > the insert but then calling this 'integer_to_sse' is a bit odd. The extract
> > cost also depends on the element number for AVX2/AVX512F. The
> > vectorizer usually decomposes a vector fully and never does single
> > element extracts so the vextract128 cost amortizes.
> >
> > That said, the change leaves all targets besides skylake_cost with
> > not so great defaults I think. For skylake you effectively add another
> > sse_op for the int->SSE move plus '1' (for whatever reason). I think
> > that's reasonable for all targets.
> >
> > It does look a bit odd to have
> >
> > 8, /* cost of moving SSE register to integer. */
> > COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> >
> > where sse_to_integer is used by the STV pass which mixes
> > COSTS_N_INSNS scaled costs and unscaled costs (ick).
>
> Debugging some eventually related thing I applied the same costing
> as you did to skylake_cost to znver2_cost and figured that
> 538.imagick_r regresses by 23% at -Ofast -march=znver2 by such
> change. So it seems changes here indeed need careful benchmarking.
>
> Richard.
>
> > Richard.
> >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/99881
> > > > > > * config/i386/i386.h (processor_costs): Add new member
> > > > > > integer_to_sse.
> > > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > > > as sse_op.
> > > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > > > vec_construct.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR target/99881
> > > > > > * gcc.target/i386/pr99881.c: New test.
> > > > > > ---
> > > > > > gcc/config/i386/i386.c | 6 ++-
> > > > > > gcc/config/i386/i386.h | 1 +
> > > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > > > > > 4 files changed, 81 insertions(+), 1 deletion(-)
> > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > >
> > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > > index ff96134fb37..fbebd2d8f9a 100644
> > > > > > --- a/gcc/config/i386/i386.c
> > > > > > +++ b/gcc/config/i386/i386.c
> > > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > > > > > case vec_construct:
> > > > > > {
> > > > > > /* N element inserts into SSE vectors. */
> > > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > > > > + int cost
> > > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > > > > + ix86_cost->sse_op
> > > > > > + : ix86_cost->integer_to_sse);
> > > > > > +
> > > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > > > > > if (GET_MODE_BITSIZE (mode) == 256)
> > > > > > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > > > index 0c2c93daf32..d1e1c225990 100644
> > > > > > --- a/gcc/config/i386/i386.h
> > > > > > +++ b/gcc/config/i386/i386.h
> > > > > > @@ -165,6 +165,7 @@ struct processor_costs {
> > > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > > > > > zmm_move;
> > > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > > > > > as static + per_item * nelts. */
> > > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > > > index ffe810f2bcb..67cfa006196 100644
> > > > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > > > > > in 128bit, 256bit and 512bit */
> > > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > > > > > 5, 0, /* Gather load static, per_elt. */
> > > > > > 5, 0, /* Gather store static, per_elt. */
> > > > > > 0, /* size of l1 cache */
> > > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 0, /* size of l1 cache */
> > > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 4, /* size of l1 cache. 486 has 8kB cache
> > > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 8, /* size of l1 cache. */
> > > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 8, /* size of l1 cache. */
> > > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 8, /* size of l1 cache. */
> > > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 5, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 5, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 16, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > 16, /* size of l1 cache. */
> > > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > > is 7 uops. */
> > > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > > register. */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > > is 7 uops. */
> > > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > > register. */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > > > > > throughput 9. Approx 7 uops do not depend on vector size and every load
> > > > > > is 4 uops. */
> > > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > > 64, /* size of l1 cache. */
> > > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 14, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 14, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 20, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 16, 16, /* Gather load static, per_elt. */
> > > > > > 16, 16, /* Gather store static, per_elt. */
> > > > > > 8, /* size of l1 cache. */
> > > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 20, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > > 12, 12, /* Gather store static, per_elt. */
> > > > > > 8, /* size of l1 cache. */
> > > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 8, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 8, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 4, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 6, 6, /* Gather load static, per_elt. */
> > > > > > 6, 6, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
> > > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > 18, 6, /* Gather load static, per_elt. */
> > > > > > 18, 6, /* Gather store static, per_elt. */
> > > > > > 32, /* size of l1 cache. */
> > > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > 2, /* cost of moving SSE register to integer. */
> > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > > > > > rec. throughput 6.
> > > > > > So 5 uops statically and one uops per load. */
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..7ae51c8310d
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > > @@ -0,0 +1,49 @@
> > > > > > +/* PR target/99881. */
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-Ofast -march=skylake" } */
> > > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > > > > > +
> > > > > > +void
> > > > > > +foo (int* __restrict a, int n, int c)
> > > > > > +{
> > > > > > + a[0] = n;
> > > > > > + a[1] = c;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > > > > > +{
> > > > > > + a[0] = n;
> > > > > > + a[1] = b;
> > > > > > + a[2] = c;
> > > > > > + a[3] = d;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > > > > > +{
> > > > > > + a[0] = n;
> > > > > > + a[1] = b;
> > > > > > + a[2] = c;
> > > > > > + a[3] = d;
> > > > > > + a[4] = e;
> > > > > > + a[5] = f;
> > > > > > + a[6] = g;
> > > > > > + a[7] = h;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +foo3 (long long* __restrict a, long long n, long long c)
> > > > > > +{
> > > > > > + a[0] = n;
> > > > > > + a[1] = c;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > > > > > +{
> > > > > > + a[0] = n;
> > > > > > + a[1] = b;
> > > > > > + a[2] = c;
> > > > > > + a[3] = d;
> > > > > > +}
> > > > > > --
> > > > > > 2.18.1
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > BR,
> > > > > Hongtao
> > > >
> > > >
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.
2021-08-03 11:12 ` Hongtao Liu
@ 2021-08-09 2:52 ` Hongtao Liu
0 siblings, 0 replies; 7+ messages in thread
From: Hongtao Liu @ 2021-08-09 2:52 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
On Tue, Aug 3, 2021 at 7:12 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Aug 3, 2021 at 6:20 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Tue, Aug 3, 2021 at 11:20 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > Correct mail list, please reply under this email.
> > > > > >
> > > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > >
> > > > > > > Hi:
> > > > > > > > As described in PR, the pinsr instruction has poor throughput in SKX
> > > > > > > and CLX, which leads to worse performance in vectorization in some cases.
> > > > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd
> > > > > > > > which is used by vector construction, the cost is the same as sse_op on other
> > > > > > > > targets, but twice as much as sse_op on CLX/SKX.
> > > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > > > > Ok for trunk?
> > > > > > >
> > > > > I'm going to check in this patch if there's no objection.
> > > > Pushed to trunk.
> > >
> > > /* N element inserts into SSE vectors. */
> > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > + int cost
> > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > + ix86_cost->sse_op
> > > + : ix86_cost->integer_to_sse);
> > > +
> > >
> > > so that's costing movd and pinsr the same, shouldn't we try to separate this
> > > by doing
> > >
> > > /* N element inserts into SSE vectors. */
> > > int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > /* Account for int->SSE reg moves. */
> > > if (!fp)
> > > cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;
> > >
> > > ? pinsr is only supported with SSE4+ IIRC. Note we also have
> we have pinsrw under sse2, and pinsrb/d/q w/ sse4+.
> integer_to_sse is an estimate to model the average overhead of each
> integer from gpr to sse, it can be movd + unpck, or movd + pinsr.
> It seems reasonable to have uniform costs for scalar_to_vec and integer_to_sse.
> vec_to_scalar and sse_to_integer seem to be different,
> sse_to_integer corresponds to movd. vec_to_scalar is vec_extract.
> Maybe we should rename integer_to_sse to vec_set_integer.
> > >
> > > case vec_to_scalar:
> > > case scalar_to_vec:
> > > return ix86_vec_cost (mode, ix86_cost->sse_op);
> > >
> > > where scalar_to_vec is used to cost splats and vec_to_scalar is used
> > > to cost element extracts. Both lack costing of the move part.
> > >
> > > I realize we have GPR to XMM inserts which cover both the "move" and
> > > the insert but then calling this 'integer_to_sse' is a bit odd. The extract
> > > cost also depends on the element number for AVX2/AVX512F. The
> > > vectorizer usually decomposes a vector fully and never does single
> > > element extracts so the vextract128 cost amortizes.
> > >
> > > That said, the change leaves all targets besides skylake_cost with
> > > not so great defaults I think. For skylake you effectively add another
> > > sse_op for the int->SSE move plus '1' (for whatever reason). I think
> > > that's reasonable for all targets.
> > >
> > > It does look a bit odd to have
> > >
> > > > 8, /* cost of moving SSE register to integer. */
> > > > COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > >
> > > where sse_to_integer is used by the STV pass which mixes
> > > > COSTS_N_INSNS scaled costs and unscaled costs (ick).
> >
> > Debugging some eventually related thing I applied the same costing
> > as you did to skylake_cost to znver2_cost and figured that
> > 538.imagick_r regresses by 23% at -Ofast -march=znver2 by such
Guess it's related to a store-forwarding stall, just like PR100076.
> > change. So it seems changes here indeed need careful benchmarking.
> >
> > Richard.
> >
> > > Richard.
> > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > > PR target/99881
> > > > > > > * config/i386/i386.h (processor_costs): Add new member
> > > > > > > integer_to_sse.
> > > > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > > > > as sse_op.
> > > > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > > > > vec_construct.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > > PR target/99881
> > > > > > > * gcc.target/i386/pr99881.c: New test.
> > > > > > > ---
> > > > > > > gcc/config/i386/i386.c | 6 ++-
> > > > > > > gcc/config/i386/i386.h | 1 +
> > > > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++
> > > > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++
> > > > > > > 4 files changed, 81 insertions(+), 1 deletion(-)
> > > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > > >
> > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > > > index ff96134fb37..fbebd2d8f9a 100644
> > > > > > > --- a/gcc/config/i386/i386.c
> > > > > > > +++ b/gcc/config/i386/i386.c
> > > > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> > > > > > > case vec_construct:
> > > > > > > {
> > > > > > > /* N element inserts into SSE vectors. */
> > > > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > > > > > + int cost
> > > > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > > > > > + ix86_cost->sse_op
> > > > > > > + : ix86_cost->integer_to_sse);
> > > > > > > +
> > > > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */
> > > > > > > if (GET_MODE_BITSIZE (mode) == 256)
> > > > > > > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > > > > index 0c2c93daf32..d1e1c225990 100644
> > > > > > > --- a/gcc/config/i386/i386.h
> > > > > > > +++ b/gcc/config/i386/i386.h
> > > > > > > @@ -165,6 +165,7 @@ struct processor_costs {
> > > > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */
> > > > > > > zmm_move;
> > > > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */
> > > > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */
> > > > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed
> > > > > > > as static + per_item * nelts. */
> > > > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > > > > index ffe810f2bcb..67cfa006196 100644
> > > > > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > > > > > > in 128bit, 256bit and 512bit */
> > > > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */
> > > > > > > 5, 0, /* Gather load static, per_elt. */
> > > > > > > 5, 0, /* Gather store static, per_elt. */
> > > > > > > 0, /* size of l1 cache */
> > > > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 0, /* size of l1 cache */
> > > > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 4, /* size of l1 cache. 486 has 8kB cache
> > > > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
> > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 8, /* size of l1 cache. */
> > > > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
> > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 8, /* size of l1 cache. */
> > > > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
> > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 8, /* size of l1 cache. */
> > > > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
> > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
> > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 2, 2, /* Gather load static, per_elt. */
> > > > > > > 2, 2, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
> > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 5, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
> > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 5, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
> > > > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 3, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 4, 4, /* Gather load static, per_elt. */
> > > > > > > 4, 4, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
> > > > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 16, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > > 16, /* size of l1 cache. */
> > > > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
> > > > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> > > > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > > > is 7 uops. */
> > > > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
> > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > > > register. */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load
> > > > > > > is 7 uops. */
> > > > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
> > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM
> > > > > > > register. */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> > > > > > > throughput 9. Approx 7 uops do not depend on vector size and every load
> > > > > > > is 4 uops. */
> > > > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
> > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */
> > > > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
> > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 20, 8, /* Gather load static, per_elt. */
> > > > > > > 22, 10, /* Gather store static, per_elt. */
> > > > > > > 64, /* size of l1 cache. */
> > > > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
> > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 14, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
> > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 14, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 10, 10, /* Gather load static, per_elt. */
> > > > > > > 10, 10, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
> > > > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> > > > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 20, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 16, 16, /* Gather load static, per_elt. */
> > > > > > > 16, 16, /* Gather store static, per_elt. */
> > > > > > > 8, /* size of l1 cache. */
> > > > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
> > > > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> > > > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 20, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */
> > > > > > > 12, 12, /* Gather load static, per_elt. */
> > > > > > > 12, 12, /* Gather store static, per_elt. */
> > > > > > > 8, /* size of l1 cache. */
> > > > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
> > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 8, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
> > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 8, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 8, 8, /* Gather load static, per_elt. */
> > > > > > > 8, 8, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
> > > > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> > > > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 4, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 6, 6, /* Gather load static, per_elt. */
> > > > > > > 6, 6, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
> > > > > > > {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
> > > > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 6, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > 18, 6, /* Gather load static, per_elt. */
> > > > > > > 18, 6, /* Gather store static, per_elt. */
> > > > > > > 32, /* size of l1 cache. */
> > > > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
> > > > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
> > > > > > > 2, /* cost of moving SSE register to integer. */
> > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */
> > > > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > > > > > > rec. throughput 6.
> > > > > > > So 5 uops statically and one uops per load. */
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..7ae51c8310d
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c
> > > > > > > @@ -0,0 +1,49 @@
> > > > > > > +/* PR target/99881. */
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-Ofast -march=skylake" } */
> > > > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
> > > > > > > +
> > > > > > > +void
> > > > > > > +foo (int* __restrict a, int n, int c)
> > > > > > > +{
> > > > > > > + a[0] = n;
> > > > > > > + a[1] = c;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +foo1 (int* __restrict a, int n, int b, int c, int d)
> > > > > > > +{
> > > > > > > + a[0] = n;
> > > > > > > + a[1] = b;
> > > > > > > + a[2] = c;
> > > > > > > + a[3] = d;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
> > > > > > > +{
> > > > > > > + a[0] = n;
> > > > > > > + a[1] = b;
> > > > > > > + a[2] = c;
> > > > > > > + a[3] = d;
> > > > > > > + a[4] = e;
> > > > > > > + a[5] = f;
> > > > > > > + a[6] = g;
> > > > > > > + a[7] = h;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +foo3 (long long* __restrict a, long long n, long long c)
> > > > > > > +{
> > > > > > > + a[0] = n;
> > > > > > > + a[1] = c;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
> > > > > > > +{
> > > > > > > + a[0] = n;
> > > > > > > + a[1] = b;
> > > > > > > + a[2] = c;
> > > > > > > + a[3] = d;
> > > > > > > +}
> > > > > > > --
> > > > > > > 2.18.1
> > > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > BR,
> > > > > > Hongtao
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > BR,
> > > > > Hongtao
> > > >
> > > >
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
>
>
>
> --
> BR,
> Hongtao
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-09 2:46 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20210726084723.49443-1-hongtao.liu@intel.com>
2021-07-26 8:49 ` [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct Hongtao Liu
2021-07-27 1:54 ` Hongtao Liu
2021-07-28 2:54 ` Hongtao Liu
2021-08-03 9:20 ` Richard Biener
2021-08-03 10:20 ` Richard Biener
2021-08-03 11:12 ` Hongtao Liu
2021-08-09 2:52 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).