public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: "Cui, Lili" <lili.cui@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: Lili Cui <lili.cui@intel.com>
Subject: [PATCH 2/2] Add a tune option to control the length of the chain with FMA
Date: Thu, 11 May 2023 10:12:01 +0000	[thread overview]
Message-ID: <20230511101201.2052667-2-lili.cui@intel.com> (raw)
In-Reply-To: <20230511101201.2052667-1-lili.cui@intel.com>

From: Lili Cui <lili.cui@intel.com>

Set the length of the chain with FMA to 5 for icelake_cost.

With this patch applied,
SPR multi-copy: 508.namd_r increased by 3%
ICX multi-copy: 508.namd_r increased by 3.5%,
                507.cactuBSSN_r increased by 3.7%

Using FMA instead of mult + add reduces register pressure and insruction
retired.

gcc/ChangeLog:

        * config/i386/i386-options.cc (ix86_option_override_internal):
        Set param_max_reassoc_fma_chain_length.
        * config/i386/i386.h (struct processor_costs): Add new tune parameters.
        * config/i386/x86-tune-costs.h (struct processor_costs): Set
	reassoc_max_chain_length_with_fma to 5 for icelake.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/fma-chain.c: New test.
---
 gcc/config/i386/i386-options.cc           |  2 ++
 gcc/config/i386/i386.h                    |  3 ++
 gcc/config/i386/x86-tune-costs.h          | 35 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++++++
 4 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..67d35d89d91 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
 		       ix86_tune_cost->l1_cache_size);
   SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
 		       ix86_tune_cost->l2_cache_size);
+  SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
+		       ix86_tune_cost->reassoc_max_chain_length_with_fma);
 
   /* 64B is the accepted value for these for all x86.  */
   SET_OPTION_IF_UNSET (&global_options, &global_options_set,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..c7fa7312a67 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -206,6 +206,9 @@ struct processor_costs {
 				   to number of instructions executed in
 				   parallel.  See also
 				   ix86_reassociation_width.  */
+  const int reassoc_max_chain_length_with_fma;
+				/* Specify max reassociation chain length with
+				   FMA.  */
   struct stringop_algs *memcpy, *memset;
   const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
 					  cost model.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4f7a67ca5c5..1f57a5ee2a7 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   ix86_size_memcpy,
   ix86_size_memset,
   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
@@ -238,6 +239,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   i386_memcpy,
   i386_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -350,6 +352,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   i486_memcpy,
   i486_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -563,6 +567,7 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -681,6 +686,7 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -790,6 +796,7 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   geode_memcpy,
   geode_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -902,6 +909,7 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   k6_memcpy,
   k6_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1015,6 +1023,7 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   athlon_memcpy,
   athlon_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1137,6 +1146,7 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   k8_memcpy,
   k8_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1267,6 +1277,7 @@ struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   amdfam10_memcpy,
   amdfam10_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -1390,6 +1401,7 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   bdver_memcpy,
   bdver_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1545,6 +1557,7 @@ struct processor_costs znver1_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver1_memcpy,
   znver1_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1704,6 +1717,7 @@ struct processor_costs znver2_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1838,6 +1852,7 @@ struct processor_costs znver3_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1974,6 +1989,7 @@ struct processor_costs znver4_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -2100,6 +2116,7 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   skylake_memcpy,
   skylake_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2228,6 +2245,12 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  /* Icelake-server prefers fma chains instead of breaking dependencies into
+     mult + add, which can reduce instruction retired. 1 means not to keep
+     the fma chain. When the value big than 1, we will generate fma chain.
+     When the actual fma chain length is greater than this value, the fma
+     chain will be split with width.  */
+  5,					/* Reassoc max FMA chain length.  */
   icelake_memcpy,
   icelake_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2350,6 +2373,7 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   alderlake_memcpy,
   alderlake_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -2465,6 +2489,7 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   btver1_memcpy,
   btver1_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -2577,6 +2602,7 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   btver2_memcpy,
   btver2_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -2688,6 +2714,7 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium4_memcpy,
   pentium4_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2802,6 +2829,7 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   nocona_memcpy,
   nocona_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2914,6 +2942,7 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   atom_memcpy,
   atom_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3026,6 +3055,7 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   slm_memcpy,
   slm_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3152,6 +3182,7 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   tremont_memcpy,
   tremont_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3264,6 +3295,7 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   intel_memcpy,
   intel_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3381,6 +3413,7 @@ struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (60),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   lujiazui_memcpy,
   lujiazui_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3502,6 +3535,7 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   generic_memcpy,
   generic_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3630,6 +3664,7 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   core_memcpy,
   core_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
diff --git a/gcc/testsuite/gcc.target/i386/fma-chain.c b/gcc/testsuite/gcc.target/i386/fma-chain.c
new file mode 100644
index 00000000000..9de61f1b6ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma-chain.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=icelake-server -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add
+   to generate more FMA instructions.  */
+float
+foo (float a, float b, float c, float d, float e, float f, float g, float h, float j)
+{
+   return a * b + c * d + e * f + g * h + j;
+}
+/* { dg-final { scan-assembler-times "vfm" 4 } } */
-- 
2.25.1


  reply	other threads:[~2023-05-11 10:12 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-11 10:12 [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass Cui, Lili
2023-05-11 10:12 ` Cui, Lili [this message]
2023-05-11 10:56   ` [PATCH 2/2] Add a tune option to control the length of the chain with FMA Richard Biener
2023-05-11 10:52 ` [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass Richard Biener
2023-05-11 15:18   ` Cui, Lili
2023-05-12  6:04     ` Richard Biener
2023-05-12  9:04       ` Cui, Lili
2023-05-15 13:12         ` Richard Biener
2023-05-17 13:05           ` Cui, Lili
2023-05-22 13:15             ` Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230511101201.2052667-2-lili.cui@intel.com \
    --to=lili.cui@intel.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).