* Add scatter/gather costs
From: Jan Hubicka @ 2017-10-25 19:23 UTC
To: gcc-patches
Hi,
this patch adds computation of scatter/gather costs to the i386 cost metric.
The costs for core are set for Haswell; Skylake has a better implementation,
so I will have to split the cost tables for cores older and newer than
Skylake. I will do that as a follow-up.
Bootstrapped/regtested x86_64-linux, committed.
Honza
* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
cost correctly.
* i386.h (processor_costs): Add gather_static, gather_per_elt,
scatter_static, scatter_per_elt.
* x86-tune-costs.h: Add new cost entries.
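For illustration, here is a minimal standalone sketch (not part of the patch)
of how the new entries are meant to combine. COSTS_N_INSNS is hard-coded to
its rtl.h definition, and the extra mode-based scaling that ix86_vec_cost may
apply is omitted, so the numbers are only indicative:

    /* Sketch of the new gather cost: static + per_elt * nelts, halved
       like the unaligned load/store cases.  */
    #include <stdio.h>

    #define COSTS_N_INSNS(n) ((n) * 4)   /* as in GCC's rtl.h */

    struct gs_cost { int stat, per_elt; };

    static int
    gather_cost (struct gs_cost c, int nelts)
    {
      return COSTS_N_INSNS (c.stat + c.per_elt * nelts) / 2;
    }

    int
    main (void)
    {
      struct gs_cost znver1 = { 18, 8 };  /* znver1 gather entries below */
      struct gs_cost core = { 10, 6 };    /* core gather entries below */
      /* A V4DF gather has 4 elements.  */
      printf ("znver1 V4DF gather: %d\n", gather_cost (znver1, 4)); /* 100 */
      printf ("core V4DF gather: %d\n", gather_cost (core, 4));     /* 68 */
      return 0;
    }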
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 254073)
+++ config/i386/i386.c (working copy)
@@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve
/* We should have separate costs for unaligned loads and gather/scatter.
Do that incrementally. */
case unaligned_load:
- case vector_gather_load:
index = sse_store_index (mode);
return ix86_vec_cost (mode,
COSTS_N_INSNS
@@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve
true);
case unaligned_store:
- case vector_scatter_store:
index = sse_store_index (mode);
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->sse_unaligned_store[index]) / 2,
true);
+ case vector_gather_load:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->gather_static
+ + ix86_cost->gather_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
+
+ case vector_scatter_store:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->scatter_static
+ + ix86_cost->scatter_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
+
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h (revision 254073)
+++ config/i386/i386.h (working copy)
@@ -253,6 +253,10 @@ struct processor_costs {
const int mmxsse_to_integer; /* cost of moving mmxsse register to
integer. */
const int ssemmx_to_integer; /* cost of moving integer to mmxsse register. */
+ const int gather_static, gather_per_elt; /* Cost of gather load is computed
+ as static + per_elt * nelts. */
+ const int scatter_static, scatter_per_elt; /* Cost of scatter store is
+ computed as static + per_elt * nelts. */
const int l1_cache_size; /* size of l1 cache, in kilobytes. */
const int l2_cache_size; /* size of l2 cache, in kilobytes. */
const int prefetch_block; /* bytes moved to cache for prefetch. */
Index: config/i386/x86-tune-costs.h
===================================================================
--- config/i386/x86-tune-costs.h (revision 254073)
+++ config/i386/x86-tune-costs.h (working copy)
@@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost =
{3, 3, 3, 3, 3}, /* cost of unaligned SSE store
in 128bit, 256bit and 512bit */
3, 3, /* SSE->integer and integer->SSE moves */
+ 5, 0, /* Gather load static, per_elt. */
+ 5, 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
@@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /*
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
@@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /*
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
shared for code and data, so 4kB is
not really precise. */
@@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
@@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
@@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost =
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
@@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
@@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
32, /* size of l2 cache. Some models
have integrated l2 cache, but
@@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves. */
+ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
+ throughput 12. Approx 9 uops do not depend on vector size and every load
+ is 7 uops. */
+ 18, 8, /* Gather load static, per_elt. */
+ 18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
@@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
in 32,64,128,256 and 512-bit */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 16, 16, /* Gather load static, per_elt. */
+ 16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
in 32,64,128,256 and 512-bit */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 12, 12, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
1024, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
4, 4, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 15, 20}, /* cost of unaligned stores. */
20, 20, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
in 32,64,128,256 and 512-bit */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2, 2, /* SSE->integer and integer->SSE moves */
+ /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPS is 9 uops,
+ rec. throughput 6.
+ So 5 uops statically and one uop per load. */
+ 10, 6, /* Gather load static, per_elt. */
+ 10, 6, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
* RE: Add scatter/gather costs
From: Kumar, Venkataramanan @ 2017-10-26 7:10 UTC
To: Jan Hubicka, gcc-patches
Hi Honza,
> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Jan Hubicka
> Sent: Thursday, October 26, 2017 12:49 AM
> To: gcc-patches@gcc.gnu.org
> Subject: Add scatter/gather costs
> @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
> in 32,64,128,256 and 512-bit. */
> {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE moves. */
> + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> + throughput 12. Approx 9 uops do not depend on vector size and every load
> + is 7 uops. */
> + 18, 8, /* Gather load static, per_elt. */
> + 18, 10, /* Gather store static, per_elt. */
Could you please explain how you arrived at 18 for the load/store static cost (based on throughput)?
Is per_elt 8 from (load latency) 4 * 2 (reg-reg move)?
Regards,
Venkat.
* Re: Add scatter/gather costs
From: Jan Hubicka @ 2017-10-26 8:12 UTC
To: Kumar, Venkataramanan; +Cc: gcc-patches
> Hi Honza,
>
> > + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> > + throughput 12. Approx 9 uops do not depend on vector size and every load
> > + is 7 uops. */
> > + 18, 8, /* Gather load static, per_elt. */
> > + 18, 10, /* Gather store static, per_elt. */
>
> Could you please explain how you arrived at 18 for the load/store static cost (based on throughput)?
> Is per_elt 8 from (load latency) 4 * 2 (reg-reg move)?
From the number of uops it seemed that gather is roughly 9 + 7*n, where n is
the number of elements. A reg-reg move costs 2, so 18 is 9*2. I think we need
to account for the fact that the CPU is really doing n independent load
operations (so it saves nothing compared to scalar code), and a bit more.
Load cost is set to 6 (perhaps it should be 8 for integer and more for FP?),
so I went for 8 to make it a bit more expensive.
I plan to experiment with the values incrementally, so any suggestions are
welcome.
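To make the arithmetic concrete, here is a small sketch (my illustration, not
GCC code; the 2- and 4-element counts are an assumption about which operand
forms the quoted uop numbers refer to):

    /* Check the fitted 9 + 7*n uop model against the uop counts quoted
       above (23 and 35 uops).  */
    #include <stdio.h>

    int
    main (void)
    {
      int static_uops = 9, per_load_uops = 7;  /* fitted model: 9 + 7*n */
      int nelts[2] = { 2, 4 };                 /* assumed element counts */
      int measured[2] = { 23, 35 };            /* uop counts quoted above */
      for (int i = 0; i < 2; i++)
        printf ("n=%d: model %d uops, measured approx %d uops\n",
                nelts[i], static_uops + per_load_uops * nelts[i],
                measured[i]);
      /* Cost-table entries derived from the model: static 18 = 9 uops * 2
         (the reg-reg move cost); per_elt 8, slightly above the plain load
         cost of 6.  */
      return 0;
    }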
Honza