* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. [not found] <20210726084723.49443-1-hongtao.liu@intel.com> @ 2021-07-26 8:49 ` Hongtao Liu 2021-07-27 1:54 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2021-07-26 8:49 UTC (permalink / raw) To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches Correct mail list, please reply under this email. On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > Hi: > As described in the PR, the pinsr instruction has poor throughput in SKX > and CLX, which leads to worse performance in vectorization in some cases. > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > which is used by vector construction, the cost is the same as sse_op on other > targets, but twice as much as sse_op on CLX/SKX. > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/99881 > * config/i386/i386.h (processor_costs): Add new member > integer_to_sse. > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > generic_cost, core_cost): Initialize integer_to_sse same value > as sse_op. > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. > * config/i386/i386.c (ix86_builtin_vectorization_cost): > Use integer_to_sse instead of sse_op to calculate the cost of > vec_construct. > > gcc/testsuite/ChangeLog: > > PR target/99881 > * gcc.target/i386/pr99881.c: New test. 
> --- > gcc/config/i386/i386.c | 6 ++- > gcc/config/i386/i386.h | 1 + > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > 4 files changed, 81 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index ff96134fb37..fbebd2d8f9a 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > case vec_construct: > { > /* N element inserts into SSE vectors. */ > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > + int cost > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > + ix86_cost->sse_op > + : ix86_cost->integer_to_sse); > + > /* One vinserti128 for combining two SSE vectors for AVX256. */ > if (GET_MODE_BITSIZE (mode) == 256) > cost += ix86_vec_cost (mode, ix86_cost->addss); > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 0c2c93daf32..d1e1c225990 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -165,6 +165,7 @@ struct processor_costs { > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > zmm_move; > const int sse_to_integer; /* cost of moving SSE register to integer. */ > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > const int gather_static, gather_per_elt; /* Cost of gather load is computed > as static + per_item * nelts. 
*/ > const int scatter_static, scatter_per_elt; /* Cost of gather store is > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > index ffe810f2bcb..67cfa006196 100644 > --- a/gcc/config/i386/x86-tune-costs.h > +++ b/gcc/config/i386/x86-tune-costs.h > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > in 128bit, 256bit and 512bit */ > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > 5, 0, /* Gather load static, per_elt. */ > 5, 0, /* Gather store static, per_elt. */ > 0, /* size of l1 cache */ > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 0, /* size of l1 cache */ > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 4, /* size of l1 cache. 486 has 8kB cache > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. 
*/ > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 2, 2, /* Gather load static, per_elt. */ > 2, 2, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 2, 2, /* Gather load static, per_elt. */ > 2, 2, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 5, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. 
*/ > 64, /* size of l1 cache. */ > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 5, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 3, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 4, 4, /* Gather load static, per_elt. */ > 4, 4, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 16, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 12, 12, /* Gather load static, per_elt. */ > 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > throughput 12. Approx 9 uops do not depend on vector size and every load > is 7 uops. */ > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > register. */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > throughput 12. Approx 9 uops do not depend on vector size and every load > is 7 uops. */ > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > register. */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > throughput 9. Approx 7 uops do not depend on vector size and every load > is 4 uops. */ > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > 20, 8, /* Gather load static, per_elt. */ > 22, 10, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 20, 8, /* Gather load static, per_elt. */ > 22, 10, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 14, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 10, 10, /* Gather load static, per_elt. */ > 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > {10, 10, 12, 48, 96}, /* cost of unaligned stores. 
*/ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 14, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 10, 10, /* Gather load static, per_elt. */ > 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > 20, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 16, 16, /* Gather load static, per_elt. */ > 16, 16, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > 20, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > 12, 12, /* Gather load static, per_elt. */ > 12, 12, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 8, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 8, 8, /* Gather load static, per_elt. */ > 8, 8, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > 8, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 8, 8, /* Gather load static, per_elt. */ > 8, 8, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. 
*/ > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > 4, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 6, 6, /* Gather load static, per_elt. */ > 6, 6, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > 6, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > 18, 6, /* Gather load static, per_elt. */ > 18, 6, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > 2, /* cost of moving SSE register to integer. */ > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > rec. throughput 6. > So 5 uops statically and one uops per load. */ > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > new file mode 100644 > index 00000000000..7ae51c8310d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > @@ -0,0 +1,49 @@ > +/* PR target/99881. 
*/ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=skylake" } */ > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > + > +void > +foo (int* __restrict a, int n, int c) > +{ > + a[0] = n; > + a[1] = c; > +} > + > +void > +foo1 (int* __restrict a, int n, int b, int c, int d) > +{ > + a[0] = n; > + a[1] = b; > + a[2] = c; > + a[3] = d; > +} > + > +void > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > +{ > + a[0] = n; > + a[1] = b; > + a[2] = c; > + a[3] = d; > + a[4] = e; > + a[5] = f; > + a[6] = g; > + a[7] = h; > +} > + > +void > +foo3 (long long* __restrict a, long long n, long long c) > +{ > + a[0] = n; > + a[1] = c; > +} > + > +void > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > +{ > + a[0] = n; > + a[1] = b; > + a[2] = c; > + a[3] = d; > +} > -- > 2.18.1 > -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-07-26 8:49 ` [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct Hongtao Liu @ 2021-07-27 1:54 ` Hongtao Liu 2021-07-28 2:54 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2021-07-27 1:54 UTC (permalink / raw) To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > Correct mail list, please reply under this email. > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > Hi: > > As described in the PR, the pinsr instruction has poor throughput in SKX > > and CLX, which leads to worse performance in vectorization in some cases. > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > which is used by vector construction, the cost is the same as sse_op on other > > targets, but twice as much as sse_op on CLX/SKX. > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > Ok for trunk? > > I'm going to check in this patch if there's no objection. > > gcc/ChangeLog: > > > > PR target/99881 > > * config/i386/i386.h (processor_costs): Add new member > > integer_to_sse. > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > generic_cost, core_cost): Initialize integer_to_sse same value > > as sse_op. > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. 
> > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > Use integer_to_sse instead of sse_op to calculate the cost of > > vec_construct. > > > > gcc/testsuite/ChangeLog: > > > > PR target/99881 > > * gcc.target/i386/pr99881.c: New test. > > --- > > gcc/config/i386/i386.c | 6 ++- > > gcc/config/i386/i386.h | 1 + > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > 4 files changed, 81 insertions(+), 1 deletion(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index ff96134fb37..fbebd2d8f9a 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > case vec_construct: > > { > > /* N element inserts into SSE vectors. */ > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > + int cost > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > + ix86_cost->sse_op > > + : ix86_cost->integer_to_sse); > > + > > /* One vinserti128 for combining two SSE vectors for AVX256. */ > > if (GET_MODE_BITSIZE (mode) == 256) > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 0c2c93daf32..d1e1c225990 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -165,6 +165,7 @@ struct processor_costs { > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > zmm_move; > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > as static + per_item * nelts. 
*/ > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > index ffe810f2bcb..67cfa006196 100644 > > --- a/gcc/config/i386/x86-tune-costs.h > > +++ b/gcc/config/i386/x86-tune-costs.h > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > in 128bit, 256bit and 512bit */ > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > 5, 0, /* Gather load static, per_elt. */ > > 5, 0, /* Gather store static, per_elt. */ > > 0, /* size of l1 cache */ > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 0, /* size of l1 cache */ > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 4, /* size of l1 cache. 486 has 8kB cache > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. 
*/ > > 4, 4, /* Gather store static, per_elt. */ > > 8, /* size of l1 cache. */ > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 8, /* size of l1 cache. */ > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 8, /* size of l1 cache. */ > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 2, 2, /* Gather load static, per_elt. */ > > 2, 2, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 2, 2, /* Gather load static, per_elt. */ > > 2, 2, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 5, /* cost of moving SSE register to integer. 
*/ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 5, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 3, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 4, 4, /* Gather load static, per_elt. */ > > 4, 4, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 16, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 12, 12, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > 16, /* size of l1 cache. */ > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > throughput 12. Approx 9 uops do not depend on vector size and every load > > is 7 uops. 
*/ > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > register. */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > throughput 12. Approx 9 uops do not depend on vector size and every load > > is 7 uops. */ > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > register. */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > throughput 9. Approx 7 uops do not depend on vector size and every load > > is 4 uops. */ > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > 20, 8, /* Gather load static, per_elt. */ > > 22, 10, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 20, 8, /* Gather load static, per_elt. */ > > 22, 10, /* Gather store static, per_elt. */ > > 64, /* size of l1 cache. */ > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 14, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > > 10, 10, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 14, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 10, 10, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > 20, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 16, 16, /* Gather load static, per_elt. */ > > 16, 16, /* Gather store static, per_elt. */ > > 8, /* size of l1 cache. */ > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > 20, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > 12, 12, /* Gather load static, per_elt. */ > > 12, 12, /* Gather store static, per_elt. */ > > 8, /* size of l1 cache. */ > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 8, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 8, 8, /* Gather load static, per_elt. */ > > 8, 8, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. 
*/ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > 8, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 8, 8, /* Gather load static, per_elt. */ > > 8, 8, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > 4, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 6, 6, /* Gather load static, per_elt. */ > > 6, 6, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > 6, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > 18, 6, /* Gather load static, per_elt. */ > > 18, 6, /* Gather store static, per_elt. */ > > 32, /* size of l1 cache. */ > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > 2, /* cost of moving SSE register to integer. */ > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > rec. throughput 6. > > So 5 uops statically and one uops per load. */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > new file mode 100644 > > index 00000000000..7ae51c8310d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > @@ -0,0 +1,49 @@ > > +/* PR target/99881. 
*/ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -march=skylake" } */ > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > + > > +void > > +foo (int* __restrict a, int n, int c) > > +{ > > + a[0] = n; > > + a[1] = c; > > +} > > + > > +void > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > +{ > > + a[0] = n; > > + a[1] = b; > > + a[2] = c; > > + a[3] = d; > > +} > > + > > +void > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > +{ > > + a[0] = n; > > + a[1] = b; > > + a[2] = c; > > + a[3] = d; > > + a[4] = e; > > + a[5] = f; > > + a[6] = g; > > + a[7] = h; > > +} > > + > > +void > > +foo3 (long long* __restrict a, long long n, long long c) > > +{ > > + a[0] = n; > > + a[1] = c; > > +} > > + > > +void > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > +{ > > + a[0] = n; > > + a[1] = b; > > + a[2] = c; > > + a[3] = d; > > +} > > -- > > 2.18.1 > > > > > -- > BR, > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-07-27 1:54 ` Hongtao Liu @ 2021-07-28 2:54 ` Hongtao Liu 2021-08-03 9:20 ` Richard Biener 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2021-07-28 2:54 UTC (permalink / raw) To: liuhongt; +Cc: Uros Bizjak, H. J. Lu, GCC Patches On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > Correct mail list, please reply under this email. > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > Hi: > > > As decribled in PR, the pinsr instruction has poor throughput in SKX > > > and CLX, which leads to worse performance in vectorization in some cases. > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > > which is used by vector construction, the cost is same as sse_op on other > > > targets, but twice much as sse_op on CLX/SKX. > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > Ok for trunk? > > > > I'm going to check in this patch if there's no objection. Pushed to trunk. > > > gcc/ChangeLog: > > > > > > PR target/99881 > > > * config/i386/i386.h (processor_costs): Add new member > > > integer_to_sse. > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > > generic_cost, core_cost): Initialize integer_to_sse same value > > > as sse_op. > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. 
> > > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > > Use integer_to_sse instead of sse_op to calculate the cost of > > > vec_construct. > > > > > > gcc/testsuite/ChangeLog: > > > > > > PR target/99881 > > > * gcc.target/i386/pr99881.c: New test. > > > --- > > > gcc/config/i386/i386.c | 6 ++- > > > gcc/config/i386/i386.h | 1 + > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > > 4 files changed, 81 insertions(+), 1 deletion(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > index ff96134fb37..fbebd2d8f9a 100644 > > > --- a/gcc/config/i386/i386.c > > > +++ b/gcc/config/i386/i386.c > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > > case vec_construct: > > > { > > > /* N element inserts into SSE vectors. */ > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > + int cost > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > > + ix86_cost->sse_op > > > + : ix86_cost->integer_to_sse); > > > + > > > /* One vinserti128 for combining two SSE vectors for AVX256. */ > > > if (GET_MODE_BITSIZE (mode) == 256) > > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > index 0c2c93daf32..d1e1c225990 100644 > > > --- a/gcc/config/i386/i386.h > > > +++ b/gcc/config/i386/i386.h > > > @@ -165,6 +165,7 @@ struct processor_costs { > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > > zmm_move; > > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > > as static + per_item * nelts. 
*/ > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > > index ffe810f2bcb..67cfa006196 100644 > > > --- a/gcc/config/i386/x86-tune-costs.h > > > +++ b/gcc/config/i386/x86-tune-costs.h > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > > in 128bit, 256bit and 512bit */ > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > > 5, 0, /* Gather load static, per_elt. */ > > > 5, 0, /* Gather store static, per_elt. */ > > > 0, /* size of l1 cache */ > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 0, /* size of l1 cache */ > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 4, /* size of l1 cache. 486 has 8kB cache > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 8, /* size of l1 cache. */ > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 8, /* size of l1 cache. */ > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 8, /* size of l1 cache. */ > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 2, 2, /* Gather load static, per_elt. */ > > > 2, 2, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. */ > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 2, 2, /* Gather load static, per_elt. */ > > > 2, 2, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. 
*/ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 5, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. */ > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 5, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. */ > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 3, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 4, 4, /* Gather load static, per_elt. */ > > > 4, 4, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. */ > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 16, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 12, 12, /* Gather load static, per_elt. */ > > > 10, 10, /* Gather store static, per_elt. */ > > > 16, /* size of l1 cache. */ > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > is 7 uops. */ > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > register. */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > is 7 uops. */ > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > register. */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > > throughput 9. Approx 7 uops do not depend on vector size and every load > > > is 4 uops. */ > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > > 20, 8, /* Gather load static, per_elt. */ > > > 22, 10, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. */ > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 20, 8, /* Gather load static, per_elt. */ > > > 22, 10, /* Gather store static, per_elt. */ > > > 64, /* size of l1 cache. 
*/ > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 14, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 10, 10, /* Gather load static, per_elt. */ > > > 10, 10, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 14, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 10, 10, /* Gather load static, per_elt. */ > > > 10, 10, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > > 20, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 16, 16, /* Gather load static, per_elt. */ > > > 16, 16, /* Gather store static, per_elt. */ > > > 8, /* size of l1 cache. */ > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > > 20, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > 12, 12, /* Gather load static, per_elt. */ > > > 12, 12, /* Gather store static, per_elt. */ > > > 8, /* size of l1 cache. */ > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. 
*/ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 8, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 8, 8, /* Gather load static, per_elt. */ > > > 8, 8, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > 8, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 8, 8, /* Gather load static, per_elt. */ > > > 8, 8, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > > 4, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 6, 6, /* Gather load static, per_elt. */ > > > 6, 6, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > > 6, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > 18, 6, /* Gather load static, per_elt. */ > > > 18, 6, /* Gather store static, per_elt. */ > > > 32, /* size of l1 cache. */ > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > 2, /* cost of moving SSE register to integer. */ > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > > rec. throughput 6. > > > So 5 uops statically and one uops per load. */ > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > > new file mode 100644 > > > index 00000000000..7ae51c8310d > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > > @@ -0,0 +1,49 @@ > > > +/* PR target/99881. */ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-Ofast -march=skylake" } */ > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > > + > > > +void > > > +foo (int* __restrict a, int n, int c) > > > +{ > > > + a[0] = n; > > > + a[1] = c; > > > +} > > > + > > > +void > > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > > +{ > > > + a[0] = n; > > > + a[1] = b; > > > + a[2] = c; > > > + a[3] = d; > > > +} > > > + > > > +void > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > > +{ > > > + a[0] = n; > > > + a[1] = b; > > > + a[2] = c; > > > + a[3] = d; > > > + a[4] = e; > > > + a[5] = f; > > > + a[6] = g; > > > + a[7] = h; > > > +} > > > + > > > +void > > > +foo3 (long long* __restrict a, long long n, long long c) > > > +{ > > > + a[0] = n; > > > + a[1] = c; > > > +} > > > + > > > +void > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > > +{ > > > + a[0] = n; > > > + a[1] = b; > > > + a[2] = c; > > > + a[3] = d; > > > +} > > > -- > > > 2.18.1 > > > > > > > > > -- > > BR, > > Hongtao > > > > -- > BR, > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-07-28 2:54 ` Hongtao Liu @ 2021-08-03 9:20 ` Richard Biener 2021-08-03 10:20 ` Richard Biener 0 siblings, 1 reply; 7+ messages in thread From: Richard Biener @ 2021-08-03 9:20 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, GCC Patches On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > Correct mail list, please reply under this email. > > > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > Hi: > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX > > > > and CLX, which leads to worse performance in vectorization in some cases. > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > > > which is used by vector construction, the cost is same as sse_op on other > > > > targets, but twice much as sse_op on CLX/SKX. > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > > Ok for trunk? > > > > > > I'm going to check in this patch if there's no objection. > Pushed to trunk. /* N element inserts into SSE vectors. */ - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; + int cost + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? + ix86_cost->sse_op + : ix86_cost->integer_to_sse); + so that's costing movd and pinsr the same, shouldn't we try to separate this by doing /* N element inserts into SSE vectors. */ int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; /* Account for int->SSE reg moves. */ if (!fp) cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse; ? pinsr is only supported with SSE4+ IIRC. 
Note we also have case vec_to_scalar: case scalar_to_vec: return ix86_vec_cost (mode, ix86_cost->sse_op); where scalar_to_vec is used to cost splats and vec_to_scalar is used to cost element extracts. Both lack costing of the move part. I realize we have GPR to XMM inserts which cover both the "move" and the insert but then calling this 'integer_to_sse' is a bit odd. The extract cost also depends on the element number for AVX2/AVX512F. The vectorizer usually decomposes a vector fully and never does single element extracts so the vextract128 cost amortizes. That said, the change leaves all targets besides skylake_cost with not so great defaults I think. For skylake you effectively add another sse_op for the int->SSE move plus '1' (for whatever reason). I think that's reasonable for all targets. It does look a bit odd to have 8, /* cost of moving SSE register to integer. */ COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ where sse_to_integer is used by the STV pass which mixes COSTS_N_INSNS scaled costs and unscaled costs (ick). Richard. > > > > gcc/ChangeLog: > > > > > > > > PR target/99881 > > > > * config/i386/i386.h (processor_costs): Add new member > > > > integer_to_sse. > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > > > generic_cost, core_cost): Initialize integer_to_sse same value > > > > as sse_op. > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > > > Use integer_to_sse instead of sse_op to calculate the cost of > > > > vec_construct. 
> > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR target/99881 > > > > * gcc.target/i386/pr99881.c: New test. > > > > --- > > > > gcc/config/i386/i386.c | 6 ++- > > > > gcc/config/i386/i386.h | 1 + > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > > > 4 files changed, 81 insertions(+), 1 deletion(-) > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > > index ff96134fb37..fbebd2d8f9a 100644 > > > > --- a/gcc/config/i386/i386.c > > > > +++ b/gcc/config/i386/i386.c > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > > > case vec_construct: > > > > { > > > > /* N element inserts into SSE vectors. */ > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > > + int cost > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > > > + ix86_cost->sse_op > > > > + : ix86_cost->integer_to_sse); > > > > + > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */ > > > > if (GET_MODE_BITSIZE (mode) == 256) > > > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > > index 0c2c93daf32..d1e1c225990 100644 > > > > --- a/gcc/config/i386/i386.h > > > > +++ b/gcc/config/i386/i386.h > > > > @@ -165,6 +165,7 @@ struct processor_costs { > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > > > zmm_move; > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > > > as static + per_item * nelts. 
*/ > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > > > index ffe810f2bcb..67cfa006196 100644 > > > > --- a/gcc/config/i386/x86-tune-costs.h > > > > +++ b/gcc/config/i386/x86-tune-costs.h > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > > > in 128bit, 256bit and 512bit */ > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > > > 5, 0, /* Gather load static, per_elt. */ > > > > 5, 0, /* Gather store static, per_elt. */ > > > > 0, /* size of l1 cache */ > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 0, /* size of l1 cache */ > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 4, /* size of l1 cache. 486 has 8kB cache > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. 
*/ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 8, /* size of l1 cache. */ > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 8, /* size of l1 cache. */ > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 8, /* size of l1 cache. */ > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 2, 2, /* Gather load static, per_elt. */ > > > > 2, 2, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 2, 2, /* Gather load static, per_elt. */ > > > > 2, 2, /* Gather store static, per_elt. 
*/ > > > > 32, /* size of l1 cache. */ > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 5, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 5, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 3, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 4, 4, /* Gather load static, per_elt. */ > > > > 4, 4, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 16, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 12, 12, /* Gather load static, per_elt. */ > > > > 10, 10, /* Gather store static, per_elt. */ > > > > 16, /* size of l1 cache. */ > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. 
*/ > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > is 7 uops. */ > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > register. */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > is 7 uops. */ > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > register. */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > > > throughput 9. Approx 7 uops do not depend on vector size and every load > > > > is 4 uops. */ > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > > > 20, 8, /* Gather load static, per_elt. */ > > > > 22, 10, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > 6, /* cost of moving SSE register to integer. 
*/ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 20, 8, /* Gather load static, per_elt. */ > > > > 22, 10, /* Gather store static, per_elt. */ > > > > 64, /* size of l1 cache. */ > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 14, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 10, 10, /* Gather load static, per_elt. */ > > > > 10, 10, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. */ > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 14, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 10, 10, /* Gather load static, per_elt. */ > > > > 10, 10, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. */ > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > > > 20, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 16, 16, /* Gather load static, per_elt. */ > > > > 16, 16, /* Gather store static, per_elt. */ > > > > 8, /* size of l1 cache. */ > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > > > 20, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > 12, 12, /* Gather load static, per_elt. 
*/ > > > > 12, 12, /* Gather store static, per_elt. */ > > > > 8, /* size of l1 cache. */ > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 8, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 8, 8, /* Gather load static, per_elt. */ > > > > 8, 8, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. */ > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > 8, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 8, 8, /* Gather load static, per_elt. */ > > > > 8, 8, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. */ > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > > > 4, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 6, 6, /* Gather load static, per_elt. */ > > > > 6, 6, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. */ > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > 6, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > 18, 6, /* Gather load static, per_elt. */ > > > > 18, 6, /* Gather store static, per_elt. */ > > > > 32, /* size of l1 cache. 
*/ > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > 2, /* cost of moving SSE register to integer. */ > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > > > rec. throughput 6. > > > > So 5 uops statically and one uops per load. */ > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > new file mode 100644 > > > > index 00000000000..7ae51c8310d > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > @@ -0,0 +1,49 @@ > > > > +/* PR target/99881. */ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-Ofast -march=skylake" } */ > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > > > + > > > > +void > > > > +foo (int* __restrict a, int n, int c) > > > > +{ > > > > + a[0] = n; > > > > + a[1] = c; > > > > +} > > > > + > > > > +void > > > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > > > +{ > > > > + a[0] = n; > > > > + a[1] = b; > > > > + a[2] = c; > > > > + a[3] = d; > > > > +} > > > > + > > > > +void > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > > > +{ > > > > + a[0] = n; > > > > + a[1] = b; > > > > + a[2] = c; > > > > + a[3] = d; > > > > + a[4] = e; > > > > + a[5] = f; > > > > + a[6] = g; > > > > + a[7] = h; > > > > +} > > > > + > > > > +void > > > > +foo3 (long long* __restrict a, long long n, long long c) > > > > +{ > > > > + a[0] = n; > > > > + a[1] = c; > > > > +} > > > > + > > > > +void > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > > > +{ > > > > + a[0] = n; > > > > + a[1] = b; > > > > + a[2] = c; > > > > + a[3] = d; > > > > +} > > > > -- > > > > 2.18.1 > > > > > > > > > > > > > -- > > > BR, > > > Hongtao 
> > > > > > > > -- > > BR, > > Hongtao > > > > -- > BR, > Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-08-03 9:20 ` Richard Biener @ 2021-08-03 10:20 ` Richard Biener 2021-08-03 11:12 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: Richard Biener @ 2021-08-03 10:20 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, GCC Patches On Tue, Aug 3, 2021 at 11:20 AM Richard Biener <richard.guenther@gmail.com> wrote: > > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > Correct mail list, please reply under this email. > > > > > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > Hi: > > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX > > > > > and CLX, which leads to worse performance in vectorization in some cases. > > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > > > > which is used by vector construction, the cost is same as sse_op on other > > > > > targets, but twice much as sse_op on CLX/SKX. > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > > > Ok for trunk? > > > > > > > > I'm going to check in this patch if there's no objection. > > Pushed to trunk. > > /* N element inserts into SSE vectors. */ > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > + int cost > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > + ix86_cost->sse_op > + : ix86_cost->integer_to_sse); > + > > so that's costing movd and pinsr the same, shouldn't we try to separate this > by doing > > /* N element inserts into SSE vectors. */ > int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > /* Account for int->SSE reg moves. 
*/ > if (!fp) > cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse; > > ? pinsr is only supported with SSE4+ IIRC. Note we also have > > case vec_to_scalar: > case scalar_to_vec: > return ix86_vec_cost (mode, ix86_cost->sse_op); > > where scalar_to_vec is used to cost splats and vec_to_scalar is used > to cost element extracts. Both lack costing of the move part. > > I realize we have GPR to XMM inserts which cover both the "move" and > the insert but then calling this 'integer_to_sse' is a bit odd. The extract > cost also depends on the element number for AVX2/AVX512F. The > vectorizer usually decomposes a vector fully and never does single > element extracts so the vextract128 cost amortizes. > > That said, the change leaves all targets besides skylake_cost with > not so great defaults I think. For skylake you effectively add another > sse_op for the int->SSE move plus '1' (for whatever reason). I think > that's reasonable for all targets. > > It does look a bit odd to have > > 8, /* cost of moving SSE register to intege > r. */ > COSTS_N_INSNS (1), /* cost of moving integer to sse registe > r. */ > > where sse_to_integer is used by the STV pass which mixes > CONST_N_INSNS scaled costs and unscaled costs (ick). While debugging a possibly related issue, I applied the same costing you used for skylake_cost to znver2_cost and found that 538.imagick_r regresses by 23% at -Ofast -march=znver2 with that change. So it seems changes here indeed need careful benchmarking. Richard. > Richard. > > > > > > gcc/ChangeLog: > > > > > > > > > > PR target/99881 > > > > > * config/i386/i386.h (processor_costs): Add new member > > > > > integer_to_sse. 
> > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > > > > generic_cost, core_cost): Initialize integer_to_sse same value > > > > > as sse_op. > > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > > > > Use integer_to_sse instead of sse_op to calculate the cost of > > > > > vec_construct. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > PR target/99881 > > > > > * gcc.target/i386/pr99881.c: New test. > > > > > --- > > > > > gcc/config/i386/i386.c | 6 ++- > > > > > gcc/config/i386/i386.h | 1 + > > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > > > > 4 files changed, 81 insertions(+), 1 deletion(-) > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > > > index ff96134fb37..fbebd2d8f9a 100644 > > > > > --- a/gcc/config/i386/i386.c > > > > > +++ b/gcc/config/i386/i386.c > > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > > > > case vec_construct: > > > > > { > > > > > /* N element inserts into SSE vectors. */ > > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > > > + int cost > > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > > > > + ix86_cost->sse_op > > > > > + : ix86_cost->integer_to_sse); > > > > > + > > > > > /* One vinserti128 for combining two SSE vectors for AVX256. 
*/ > > > > > if (GET_MODE_BITSIZE (mode) == 256) > > > > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > > > index 0c2c93daf32..d1e1c225990 100644 > > > > > --- a/gcc/config/i386/i386.h > > > > > +++ b/gcc/config/i386/i386.h > > > > > @@ -165,6 +165,7 @@ struct processor_costs { > > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > > > > zmm_move; > > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > > > > as static + per_item * nelts. */ > > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > > > > index ffe810f2bcb..67cfa006196 100644 > > > > > --- a/gcc/config/i386/x86-tune-costs.h > > > > > +++ b/gcc/config/i386/x86-tune-costs.h > > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > > > > in 128bit, 256bit and 512bit */ > > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > > > > 5, 0, /* Gather load static, per_elt. */ > > > > > 5, 0, /* Gather store static, per_elt. */ > > > > > 0, /* size of l1 cache */ > > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. 
*/ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 0, /* size of l1 cache */ > > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 4, /* size of l1 cache. 486 has 8kB cache > > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 8, /* size of l1 cache. */ > > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 8, /* size of l1 cache. */ > > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. 
*/ > > > > > 8, /* size of l1 cache. */ > > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > 2, 2, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. */ > > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > 2, 2, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. */ > > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. */ > > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. 
*/ > > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. */ > > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 16, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > 16, /* size of l1 cache. */ > > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > is 7 uops. */ > > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > register. */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > is 7 uops. 
*/ > > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > register. */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > > > > throughput 9. Approx 7 uops do not depend on vector size and every load > > > > > is 4 uops. */ > > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. */ > > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > 64, /* size of l1 cache. */ > > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. 
*/ > > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. */ > > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 16, 16, /* Gather load static, per_elt. */ > > > > > 16, 16, /* Gather store static, per_elt. */ > > > > > 8, /* size of l1 cache. */ > > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > 12, 12, /* Gather store static, per_elt. */ > > > > > 8, /* size of l1 cache. */ > > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. 
*/ > > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. */ > > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > > > > 4, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 6, 6, /* Gather load static, per_elt. */ > > > > > 6, 6, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. */ > > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > 18, 6, /* Gather load static, per_elt. */ > > > > > 18, 6, /* Gather store static, per_elt. */ > > > > > 32, /* size of l1 cache. */ > > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > 2, /* cost of moving SSE register to integer. */ > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > > > > rec. throughput 6. > > > > > So 5 uops statically and one uops per load. 
*/ > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > new file mode 100644 > > > > > index 00000000000..7ae51c8310d > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > @@ -0,0 +1,49 @@ > > > > > +/* PR target/99881. */ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-options "-Ofast -march=skylake" } */ > > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > > > > + > > > > > +void > > > > > +foo (int* __restrict a, int n, int c) > > > > > +{ > > > > > + a[0] = n; > > > > > + a[1] = c; > > > > > +} > > > > > + > > > > > +void > > > > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > > > > +{ > > > > > + a[0] = n; > > > > > + a[1] = b; > > > > > + a[2] = c; > > > > > + a[3] = d; > > > > > +} > > > > > + > > > > > +void > > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > > > > +{ > > > > > + a[0] = n; > > > > > + a[1] = b; > > > > > + a[2] = c; > > > > > + a[3] = d; > > > > > + a[4] = e; > > > > > + a[5] = f; > > > > > + a[6] = g; > > > > > + a[7] = h; > > > > > +} > > > > > + > > > > > +void > > > > > +foo3 (long long* __restrict a, long long n, long long c) > > > > > +{ > > > > > + a[0] = n; > > > > > + a[1] = c; > > > > > +} > > > > > + > > > > > +void > > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > > > > +{ > > > > > + a[0] = n; > > > > > + a[1] = b; > > > > > + a[2] = c; > > > > > + a[3] = d; > > > > > +} > > > > > -- > > > > > 2.18.1 > > > > > > > > > > > > > > > > > -- > > > > BR, > > > > Hongtao > > > > > > > > > > > > -- > > > BR, > > > Hongtao > > > > > > > > -- > > BR, > > Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-08-03 10:20 ` Richard Biener @ 2021-08-03 11:12 ` Hongtao Liu 2021-08-09 2:52 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2021-08-03 11:12 UTC (permalink / raw) To: Richard Biener; +Cc: liuhongt, GCC Patches On Tue, Aug 3, 2021 at 6:20 PM Richard Biener <richard.guenther@gmail.com> wrote: > > On Tue, Aug 3, 2021 at 11:20 AM Richard Biener > <richard.guenther@gmail.com> wrote: > > > > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > Correct mail list, please reply under this email. > > > > > > > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > Hi: > > > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX > > > > > > and CLX, which leads to worse performance in vectorization in some cases. > > > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > > > > > which is used by vector construction, the cost is same as sse_op on other > > > > > > targets, but twice much as sse_op on CLX/SKX. > > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > > > > Ok for trunk? > > > > > > > > > > I'm going to check in this patch if there's no objection. > > > Pushed to trunk. > > > > /* N element inserts into SSE vectors. */ > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > + int cost > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? 
> > + ix86_cost->sse_op > > + : ix86_cost->integer_to_sse); > > + > > > > so that's costing movd and pinsr the same, shouldn't we try to separate this > > by doing > > > > /* N element inserts into SSE vectors. */ > > int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > /* Account for int->SSE reg moves. */ > > if (!fp) > > cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse; > > > > ? pinsr is only supported with SSE4+ IIRC. Note we also have We have pinsrw under sse2, and pinsrb/d/q w/ sse4+. integer_to_sse is an estimate to model the average overhead of each integer from gpr to sse, it can be movd + unpck, or movd + pinsr. It seems reasonable to have uniform costs for scalar_to_vec and integer_to_sse. vec_to_scalar and sse_to_integer seem to be different, sse_to_integer corresponds to movd. vec_to_scalar is vec_extract. Maybe we should rename integer_to_sse to vec_set_integer. > > > > case vec_to_scalar: > > case scalar_to_vec: > > return ix86_vec_cost (mode, ix86_cost->sse_op); > > > > where scalar_to_vec is used to cost splats and vec_to_scalar is used > > to cost element extracts. Both lack costing of the move part. > > > > I realize we have GPR to XMM inserts which cover both the "move" and > > the insert but then calling this 'integer_to_sse' is a bit odd. The extract > > cost also depends on the element number for AVX2/AVX512F. The > > vectorizer usually decomposes a vector fully and never does single > > element extracts so the vextract128 cost amortizes. > > > > That said, the change leaves all targets besides skylake_cost with > > not so great defaults I think. For skylake you effectively add another > > sse_op for the int->SSE move plus '1' (for whatever reason). I think > > that's reasonable for all targets. > > > > It does look a bit odd to have > > > > 8, /* cost of moving SSE register to intege > > r. */ > > COSTS_N_INSNS (1), /* cost of moving integer to sse registe > > r. 
*/ > > > > where sse_to_integer is used by the STV pass which mixes > > CONST_N_INSNS scaled costs and unscaled costs (ick). > > Debugging some eventually related thing I applied the same costing > as you did to skylake_cost to znver2_cost and figured that > 538.imagick_r regresses by 23% at -Ofast -march=znver2 by such > change. So it seems changes here indeed need careful benchmarking. > > Richard. > > > Richard. > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR target/99881 > > > > > > * config/i386/i386.h (processor_costs): Add new member > > > > > > integer_to_sse. > > > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > > > > > generic_cost, core_cost): Initialize integer_to_sse same value > > > > > > as sse_op. > > > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. > > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > > > > > Use integer_to_sse instead of sse_op to calculate the cost of > > > > > > vec_construct. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR target/99881 > > > > > > * gcc.target/i386/pr99881.c: New test. 
> > > > > > --- > > > > > > gcc/config/i386/i386.c | 6 ++- > > > > > > gcc/config/i386/i386.h | 1 + > > > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > > > > > 4 files changed, 81 insertions(+), 1 deletion(-) > > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > > > > index ff96134fb37..fbebd2d8f9a 100644 > > > > > > --- a/gcc/config/i386/i386.c > > > > > > +++ b/gcc/config/i386/i386.c > > > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > > > > > case vec_construct: > > > > > > { > > > > > > /* N element inserts into SSE vectors. */ > > > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > > > > + int cost > > > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > > > > > + ix86_cost->sse_op > > > > > > + : ix86_cost->integer_to_sse); > > > > > > + > > > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */ > > > > > > if (GET_MODE_BITSIZE (mode) == 256) > > > > > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > > > > index 0c2c93daf32..d1e1c225990 100644 > > > > > > --- a/gcc/config/i386/i386.h > > > > > > +++ b/gcc/config/i386/i386.h > > > > > > @@ -165,6 +165,7 @@ struct processor_costs { > > > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > > > > > zmm_move; > > > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > > > > > as static + per_item * nelts. 
*/ > > > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > > > > > index ffe810f2bcb..67cfa006196 100644 > > > > > > --- a/gcc/config/i386/x86-tune-costs.h > > > > > > +++ b/gcc/config/i386/x86-tune-costs.h > > > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > > > > > in 128bit, 256bit and 512bit */ > > > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > > > > > 5, 0, /* Gather load static, per_elt. */ > > > > > > 5, 0, /* Gather store static, per_elt. */ > > > > > > 0, /* size of l1 cache */ > > > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 0, /* size of l1 cache */ > > > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 4, /* size of l1 cache. 486 has 8kB cache > > > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 8, /* size of l1 cache. */ > > > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 8, /* size of l1 cache. */ > > > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 8, /* size of l1 cache. */ > > > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > > 2, 2, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. 
*/ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > > 2, 2, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. 
*/ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 16, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > 16, /* size of l1 cache. */ > > > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > > is 7 uops. */ > > > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > > register. */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > > is 7 uops. */ > > > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > > register. */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > > > > > throughput 9. Approx 7 uops do not depend on vector size and every load > > > > > > is 4 uops. */ > > > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. 
*/ > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > > 64, /* size of l1 cache. */ > > > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. 
*/ > > > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 16, 16, /* Gather load static, per_elt. */ > > > > > > 16, 16, /* Gather store static, per_elt. */ > > > > > > 8, /* size of l1 cache. */ > > > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > > 12, 12, /* Gather store static, per_elt. */ > > > > > > 8, /* size of l1 cache. */ > > > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. 
*/ > > > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 4, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 6, 6, /* Gather load static, per_elt. */ > > > > > > 6, 6, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > 18, 6, /* Gather load static, per_elt. */ > > > > > > 18, 6, /* Gather store static, per_elt. */ > > > > > > 32, /* size of l1 cache. */ > > > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > 2, /* cost of moving SSE register to integer. */ > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > > > > > rec. throughput 6. > > > > > > So 5 uops statically and one uops per load. */ > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > new file mode 100644 > > > > > > index 00000000000..7ae51c8310d > > > > > > --- /dev/null > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > @@ -0,0 +1,49 @@ > > > > > > +/* PR target/99881. 
*/ > > > > > > +/* { dg-do compile } */ > > > > > > +/* { dg-options "-Ofast -march=skylake" } */ > > > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > > > > > + > > > > > > +void > > > > > > +foo (int* __restrict a, int n, int c) > > > > > > +{ > > > > > > + a[0] = n; > > > > > > + a[1] = c; > > > > > > +} > > > > > > + > > > > > > +void > > > > > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > > > > > +{ > > > > > > + a[0] = n; > > > > > > + a[1] = b; > > > > > > + a[2] = c; > > > > > > + a[3] = d; > > > > > > +} > > > > > > + > > > > > > +void > > > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > > > > > +{ > > > > > > + a[0] = n; > > > > > > + a[1] = b; > > > > > > + a[2] = c; > > > > > > + a[3] = d; > > > > > > + a[4] = e; > > > > > > + a[5] = f; > > > > > > + a[6] = g; > > > > > > + a[7] = h; > > > > > > +} > > > > > > + > > > > > > +void > > > > > > +foo3 (long long* __restrict a, long long n, long long c) > > > > > > +{ > > > > > > + a[0] = n; > > > > > > + a[1] = c; > > > > > > +} > > > > > > + > > > > > > +void > > > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > > > > > +{ > > > > > > + a[0] = n; > > > > > > + a[1] = b; > > > > > > + a[2] = c; > > > > > > + a[3] = d; > > > > > > +} > > > > > > -- > > > > > > 2.18.1 > > > > > > > > > > > > > > > > > > > > > -- > > > > > BR, > > > > > Hongtao > > > > > > > > > > > > > > > > -- > > > > BR, > > > > Hongtao > > > > > > > > > > > > -- > > > BR, > > > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. 2021-08-03 11:12 ` Hongtao Liu @ 2021-08-09 2:52 ` Hongtao Liu 0 siblings, 0 replies; 7+ messages in thread From: Hongtao Liu @ 2021-08-09 2:52 UTC (permalink / raw) To: Richard Biener; +Cc: liuhongt, GCC Patches On Tue, Aug 3, 2021 at 7:12 PM Hongtao Liu <crazylht@gmail.com> wrote: > > On Tue, Aug 3, 2021 at 6:20 PM Richard Biener > <richard.guenther@gmail.com> wrote: > > > > On Tue, Aug 3, 2021 at 11:20 AM Richard Biener > > <richard.guenther@gmail.com> wrote: > > > > > > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > > > Correct mail list, please reply under this email. > > > > > > > > > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > > > Hi: > > > > > > > As decribled in PR, the pinsr instruction has poor throughput in SKX > > > > > > > and CLX, which leads to worse performance in vectorization in some cases. > > > > > > > This patch adds a cost member named integer_to_sse to simulate pinsr/movd > > > > > > > which is used by vector construction, the cost is same as sse_op on other > > > > > > > targets, but twice much as sse_op on CLX/SKX. > > > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > > > > > Ok for trunk? > > > > > > > > > > > > I'm going to check in this patch if there's no objection. > > > > Pushed to trunk. > > > > > > /* N element inserts into SSE vectors. */ > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > + int cost > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? 
> > > + ix86_cost->sse_op > > > + : ix86_cost->integer_to_sse); > > > + > > > > > > so that's costing movd and pinsr the same, shouldn't we try to separate this > > > by doing > > > > > > /* N element inserts into SSE vectors. */ > > > int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > /* Account for int->SSE reg moves. */ > > > if (!fp) > > > cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse; > > > > > > ? pinsr is only supported with SSE4+ IIRC. Note we also have > we have pinsrw under sse2, and pinsrb/d/q w/ sse4+. > integer_to_sse is an estimate to model the average overhead of each > integer from gpr to sse, it can be movd + unpck, or movd + pinsr. > It seems reasonable to have uniform costs for scalar_to_vec and integer_to_sse. > vec_to_scalar and sse_to_integer seem to be different, > sse_to_integer corresponds to movd. vec_to_scalar is vec_extract. > Maybe we should rename integer_to_sse to vec_set_integer. > > > > > > case vec_to_scalar: > > > case scalar_to_vec: > > > return ix86_vec_cost (mode, ix86_cost->sse_op); > > > > > > where scalar_to_vec is used to cost splats and vec_to_scalar is used > > > to cost element extracts. Both lack costing of the move part. > > > > > > I realize we have GPR to XMM inserts which cover both the "move" and > > > the insert but then calling this 'integer_to_sse' is a bit odd. The extract > > > cost also depends on the element number for AVX2/AVX512F. The > > > vectorizer usually decomposes a vector fully and never does single > > > element extracts so the vextract128 cost amortizes. > > > > > > That said, the change leaves all targets besides skylake_cost with > > > not so great defaults I think. For skylake you effectively add another > > > sse_op for the int->SSE move plus '1' (for whatever reason). I think > > > that's reasonable for all targets. > > > > > > It does look a bit odd to have > > > > > > 8, /* cost of moving SSE register to intege > > > r. 
*/ > > > COSTS_N_INSNS (1), /* cost of moving integer to sse registe > > > r. */ > > > > > > where sse_to_integer is used by the STV pass which mixes > > > COSTS_N_INSNS scaled costs and unscaled costs (ick). > > > > Debugging some eventually related thing I applied the same costing > > as you did to skylake_cost to znver2_cost and figured that > > 538.imagick_r regresses by 23% at -Ofast -march=znver2 by such Guess it's related to store-forwarding stall, just like PR100076. > > change. So it seems changes here indeed need careful benchmarking. > > > > Richard. > > > > > Richard. > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > > > PR target/99881 > > > > > > > * config/i386/i386.h (processor_costs): Add new member > > > > > > > integer_to_sse. > > > > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, > > > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, > > > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, > > > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost, > > > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost, > > > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, > > > > > > > generic_cost, core_cost): Initialize integer_to_sse same value > > > > > > > as sse_op. > > > > > > > (skylake_cost): Initialize integer_to_sse twice as much as sse_op. > > > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost): > > > > > > > Use integer_to_sse instead of sse_op to calculate the cost of > > > > > > > vec_construct. > > > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > > > PR target/99881 > > > > > > > * gcc.target/i386/pr99881.c: New test. 
> > > > > > > --- > > > > > > > gcc/config/i386/i386.c | 6 ++- > > > > > > > gcc/config/i386/i386.h | 1 + > > > > > > > gcc/config/i386/x86-tune-costs.h | 26 +++++++++++++ > > > > > > > gcc/testsuite/gcc.target/i386/pr99881.c | 49 +++++++++++++++++++++++++ > > > > > > > 4 files changed, 81 insertions(+), 1 deletion(-) > > > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > > > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > > > > > index ff96134fb37..fbebd2d8f9a 100644 > > > > > > > --- a/gcc/config/i386/i386.c > > > > > > > +++ b/gcc/config/i386/i386.c > > > > > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > > > > > > > case vec_construct: > > > > > > > { > > > > > > > /* N element inserts into SSE vectors. */ > > > > > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; > > > > > > > + int cost > > > > > > > + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? > > > > > > > + ix86_cost->sse_op > > > > > > > + : ix86_cost->integer_to_sse); > > > > > > > + > > > > > > > /* One vinserti128 for combining two SSE vectors for AVX256. */ > > > > > > > if (GET_MODE_BITSIZE (mode) == 256) > > > > > > > cost += ix86_vec_cost (mode, ix86_cost->addss); > > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > > > > > index 0c2c93daf32..d1e1c225990 100644 > > > > > > > --- a/gcc/config/i386/i386.h > > > > > > > +++ b/gcc/config/i386/i386.h > > > > > > > @@ -165,6 +165,7 @@ struct processor_costs { > > > > > > > const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ > > > > > > > zmm_move; > > > > > > > const int sse_to_integer; /* cost of moving SSE register to integer. */ > > > > > > > + const int integer_to_sse; /* cost of moving integer to SSE register. */ > > > > > > > const int gather_static, gather_per_elt; /* Cost of gather load is computed > > > > > > > as static + per_item * nelts. 
*/ > > > > > > > const int scatter_static, scatter_per_elt; /* Cost of gather store is > > > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > > > > > > > index ffe810f2bcb..67cfa006196 100644 > > > > > > > --- a/gcc/config/i386/x86-tune-costs.h > > > > > > > +++ b/gcc/config/i386/x86-tune-costs.h > > > > > > > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ > > > > > > > in 128bit, 256bit and 512bit */ > > > > > > > 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ > > > > > > > 5, 0, /* Gather load static, per_elt. */ > > > > > > > 5, 0, /* Gather store static, per_elt. */ > > > > > > > 0, /* size of l1 cache */ > > > > > > > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ > > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 0, /* size of l1 cache */ > > > > > > > @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ > > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 4, /* size of l1 cache. 
486 has 8kB cache > > > > > > > @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { > > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 8, /* size of l1 cache. */ > > > > > > > @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { > > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 8, /* size of l1 cache. */ > > > > > > > @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { > > > > > > > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 8, /* size of l1 cache. */ > > > > > > > @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { > > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > > > 2, 2, /* Gather store static, per_elt. 
*/ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { > > > > > > > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 2, 2, /* Gather load static, per_elt. */ > > > > > > > 2, 2, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { > > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { > > > > > > > {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 5, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. */ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { > > > > > > > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 3, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 4, 4, /* Gather load static, per_elt. */ > > > > > > > 4, 4, /* Gather store static, per_elt. 
*/ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { > > > > > > > {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 16, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > > 16, /* size of l1 cache. */ > > > > > > > @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { > > > > > > > {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > > > > > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > > > is 7 uops. */ > > > > > > > @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { > > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > > > register. */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > > > > > > throughput 12. Approx 9 uops do not depend on vector size and every load > > > > > > > is 7 uops. */ > > > > > > > @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { > > > > > > > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > > > > > > register. */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. 
*/ > > > > > > > /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > > > > > > > throughput 9. Approx 7 uops do not depend on vector size and every load > > > > > > > is 4 uops. */ > > > > > > > @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { > > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ > > > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { > > > > > > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 20, 8, /* Gather load static, per_elt. */ > > > > > > > 22, 10, /* Gather store static, per_elt. */ > > > > > > > 64, /* size of l1 cache. */ > > > > > > > @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { > > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { > > > > > > > {10, 10, 12, 48, 96}, /* cost of unaligned stores. 
*/ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 14, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 10, 10, /* Gather load static, per_elt. */ > > > > > > > 10, 10, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { > > > > > > > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > > > > > > > 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 16, 16, /* Gather load static, per_elt. */ > > > > > > > 16, 16, /* Gather store static, per_elt. */ > > > > > > > 8, /* size of l1 cache. */ > > > > > > > @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { > > > > > > > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > > > > > > > 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 20, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ > > > > > > > 12, 12, /* Gather load static, per_elt. */ > > > > > > > 12, 12, /* Gather store static, per_elt. */ > > > > > > > 8, /* size of l1 cache. */ > > > > > > > @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { > > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. 
*/ > > > > > > > @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { > > > > > > > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > > > > > > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 8, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 8, 8, /* Gather load static, per_elt. */ > > > > > > > 8, 8, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { > > > > > > > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > > > > > > > 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 4, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 6, 6, /* Gather load static, per_elt. */ > > > > > > > 6, 6, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { > > > > > > > {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ > > > > > > > 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 6, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > 18, 6, /* Gather load static, per_elt. */ > > > > > > > 18, 6, /* Gather store static, per_elt. */ > > > > > > > 32, /* size of l1 cache. */ > > > > > > > @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { > > > > > > > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > > > > > > > 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ > > > > > > > 2, /* cost of moving SSE register to integer. */ > > > > > > > + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ > > > > > > > /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > > > > > > > rec. throughput 6. 
> > > > > > > So 5 uops statically and one uops per load. */ > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > new file mode 100644 > > > > > > > index 00000000000..7ae51c8310d > > > > > > > --- /dev/null > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > > > > > > > @@ -0,0 +1,49 @@ > > > > > > > +/* PR target/99881. */ > > > > > > > +/* { dg-do compile } */ > > > > > > > +/* { dg-options "-Ofast -march=skylake" } */ > > > > > > > +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ > > > > > > > + > > > > > > > +void > > > > > > > +foo (int* __restrict a, int n, int c) > > > > > > > +{ > > > > > > > + a[0] = n; > > > > > > > + a[1] = c; > > > > > > > +} > > > > > > > + > > > > > > > +void > > > > > > > +foo1 (int* __restrict a, int n, int b, int c, int d) > > > > > > > +{ > > > > > > > + a[0] = n; > > > > > > > + a[1] = b; > > > > > > > + a[2] = c; > > > > > > > + a[3] = d; > > > > > > > +} > > > > > > > + > > > > > > > +void > > > > > > > +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) > > > > > > > +{ > > > > > > > + a[0] = n; > > > > > > > + a[1] = b; > > > > > > > + a[2] = c; > > > > > > > + a[3] = d; > > > > > > > + a[4] = e; > > > > > > > + a[5] = f; > > > > > > > + a[6] = g; > > > > > > > + a[7] = h; > > > > > > > +} > > > > > > > + > > > > > > > +void > > > > > > > +foo3 (long long* __restrict a, long long n, long long c) > > > > > > > +{ > > > > > > > + a[0] = n; > > > > > > > + a[1] = c; > > > > > > > +} > > > > > > > + > > > > > > > +void > > > > > > > +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) > > > > > > > +{ > > > > > > > + a[0] = n; > > > > > > > + a[1] = b; > > > > > > > + a[2] = c; > > > > > > > + a[3] = d; > > > > > > > +} > > > > > > > -- > > > > > > > 2.18.1 > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > BR, > > > > > > Hongtao > > > > > > > > > > > > > > > > > > > > 
-- > > > > > BR, > > > > > Hongtao > > > > > > > > > > > > > > > > -- > > > > BR, > > > > Hongtao > > > > -- > BR, > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-09 2:46 UTC | newest] Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20210726084723.49443-1-hongtao.liu@intel.com> 2021-07-26 8:49 ` [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct Hongtao Liu 2021-07-27 1:54 ` Hongtao Liu 2021-07-28 2:54 ` Hongtao Liu 2021-08-03 9:20 ` Richard Biener 2021-08-03 10:20 ` Richard Biener 2021-08-03 11:12 ` Hongtao Liu 2021-08-09 2:52 ` Hongtao Liu
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).