From: "Cui, Lili" <lili.cui@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: Lili Cui <lili.cui@intel.com>
Subject: [PATCH 2/2] Add a tune option to control the length of the chain with FMA
Date: Thu, 11 May 2023 10:12:01 +0000 [thread overview]
Message-ID: <20230511101201.2052667-2-lili.cui@intel.com> (raw)
In-Reply-To: <20230511101201.2052667-1-lili.cui@intel.com>
From: Lili Cui <lili.cui@intel.com>
Set the length of the chain with FMA to 5 for icelake_cost.
With this patch applied,
SPR multi-copy: 508.namd_r increased by 3%
ICX multi-copy: 508.namd_r increased by 3.5%,
507.cactuBSSN_r increased by 3.7%
Using FMA instead of mult + add reduces register pressure and the number of
instructions retired.
gcc/ChangeLog:
* config/i386/i386-options.cc (ix86_option_override_internal):
Set param_reassoc_max_chain_length_with_fma.
* config/i386/i386.h (struct processor_costs): Add new tune parameter reassoc_max_chain_length_with_fma.
* config/i386/x86-tune-costs.h (struct processor_costs): Set
reassoc_max_chain_length_with_fma to 5 for icelake.
gcc/testsuite/ChangeLog:
* gcc.target/i386/fma-chain.c: New test.
---
gcc/config/i386/i386-options.cc | 2 ++
gcc/config/i386/i386.h | 3 ++
gcc/config/i386/x86-tune-costs.h | 35 +++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++++++
4 files changed, 51 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..67d35d89d91 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
ix86_tune_cost->l1_cache_size);
SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
ix86_tune_cost->l2_cache_size);
+ SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
+ ix86_tune_cost->reassoc_max_chain_length_with_fma);
/* 64B is the accepted value for these for all x86. */
SET_OPTION_IF_UNSET (&global_options, &global_options_set,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..c7fa7312a67 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -206,6 +206,9 @@ struct processor_costs {
to number of instructions executed in
parallel. See also
ix86_reassociation_width. */
+ const int reassoc_max_chain_length_with_fma;
+ /* Specify max reassociation chain length with
+ FMA. */
struct stringop_algs *memcpy, *memset;
const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer
cost model. */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4f7a67ca5c5..1f57a5ee2a7 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
ix86_size_memcpy,
ix86_size_memset,
COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
@@ -238,6 +239,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
i386_memcpy,
i386_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -350,6 +352,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
i486_memcpy,
i486_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -563,6 +567,7 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -681,6 +686,7 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
pentiumpro_memcpy,
pentiumpro_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -790,6 +796,7 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
geode_memcpy,
geode_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -902,6 +909,7 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
k6_memcpy,
k6_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1015,6 +1023,7 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
athlon_memcpy,
athlon_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1137,6 +1146,7 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
k8_memcpy,
k8_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1267,6 +1277,7 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
amdfam10_memcpy,
amdfam10_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -1390,6 +1401,7 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
bdver_memcpy,
bdver_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1545,6 +1557,7 @@ struct processor_costs znver1_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
znver1_memcpy,
znver1_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1704,6 +1717,7 @@ struct processor_costs znver2_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1838,6 +1852,7 @@ struct processor_costs znver3_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1974,6 +1989,7 @@ struct processor_costs znver4_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2100,6 +2116,7 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
skylake_memcpy,
skylake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2228,6 +2245,12 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ /* Icelake-server prefers keeping FMA chains over breaking dependencies
+ into mult + add, which reduces the number of instructions retired.
+ A value of 1 means FMA chains are not kept. A value greater than 1
+ enables FMA chain generation; when the actual chain is longer than this
+ value, it is split according to the reassociation width. */
+ 5, /* Reassoc max FMA chain length. */
icelake_memcpy,
icelake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2350,6 +2373,7 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
alderlake_memcpy,
alderlake_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2465,6 +2489,7 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
btver1_memcpy,
btver1_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -2577,6 +2602,7 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -2688,6 +2714,7 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2802,6 +2829,7 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
nocona_memcpy,
nocona_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2914,6 +2942,7 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
atom_memcpy,
atom_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3026,6 +3055,7 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
slm_memcpy,
slm_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3152,6 +3182,7 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
tremont_memcpy,
tremont_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3264,6 +3295,7 @@ struct processor_costs intel_cost = {
COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
intel_memcpy,
intel_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3381,6 +3413,7 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
lujiazui_memcpy,
lujiazui_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3502,6 +3535,7 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
generic_memcpy,
generic_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3630,6 +3664,7 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ 1, /* Reassoc max FMA chain length. */
core_memcpy,
core_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
diff --git a/gcc/testsuite/gcc.target/i386/fma-chain.c b/gcc/testsuite/gcc.target/i386/fma-chain.c
new file mode 100644
index 00000000000..9de61f1b6ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma-chain.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=icelake-server -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add
+ to generate more FMA instructions. */
+float
+foo (float a, float b, float c, float d, float e, float f, float g, float h, float j)
+{
+ return a * b + c * d + e * f + g * h + j;
+}
+/* { dg-final { scan-assembler-times "vfm" 4 } } */
--
2.25.1
next prev parent reply other threads:[~2023-05-11 10:12 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-05-11 10:12 [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass Cui, Lili
2023-05-11 10:12 ` Cui, Lili [this message]
2023-05-11 10:56 ` [PATCH 2/2] Add a tune option to control the length of the chain with FMA Richard Biener
2023-05-11 10:52 ` [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass Richard Biener
2023-05-11 15:18 ` Cui, Lili
2023-05-12 6:04 ` Richard Biener
2023-05-12 9:04 ` Cui, Lili
2023-05-15 13:12 ` Richard Biener
2023-05-17 13:05 ` Cui, Lili
2023-05-22 13:15 ` Richard Biener
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230511101201.2052667-2-lili.cui@intel.com \
--to=lili.cui@intel.com \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).