public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Richard Sandiford <richard.sandiford@arm.com>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH] AArch64: Add fma_reassoc_width [PR107413]
Date: Tue, 22 Nov 2022 07:58:15 +0000	[thread overview]
Message-ID: <mpt5yf7beug.fsf@arm.com> (raw)
In-Reply-To: <PAWPR08MB89824348B31E96B4F6432A3C833E9@PAWPR08MB8982.eurprd08.prod.outlook.com> (Wilco Dijkstra's message of "Wed, 9 Nov 2022 12:40:05 +0000")

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Add a reassocation width for FMAs in per-CPU tuning structures. Keep the
> existing setting for cores with 2 FMA pipes, and use 4 for cores with 4
> FMA pipes.  This improves SPECFP2017 on Neoverse V1 by ~1.5%.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/
>         PR 107413
>         * config/aarch64/aarch64.cc (struct tune_params): Add
>         fma_reassoc_width to all CPU tuning structures.
>         * config/aarch64/aarch64-protos.h (struct tune_params): Add
>         fma_reassoc_width.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index a73bfa20acb9b92ae0475794c3f11c67d22feb97..71365a446007c26b906b61ca8b2a68ee06c83037 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -540,6 +540,7 @@ struct tune_params
>    const char *loop_align;
>    int int_reassoc_width;
>    int fp_reassoc_width;
> +  int fma_reassoc_width;
>    int vec_reassoc_width;
>    int min_div_recip_mul_sf;
>    int min_div_recip_mul_df;
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 798363bcc449c414de5bbb4f26b8e1c64a0cf71a..643162cdecd6a8fe5587164cb2d0d62b709a491d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
>    "4", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
>    "8",  /* loop_align.  */
>    2,    /* int_reassoc_width.  */
>    4,    /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,    /* vec_reassoc_width.  */
>    2,    /* min_div_recip_mul_sf.  */
>    2,    /* min_div_recip_mul_df.  */
> @@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
>    "16",        /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
>    "16",        /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2089,6 +2107,7 @@ static const struct tune_params neoversev1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2226,6 +2245,7 @@ static const struct tune_params neoverse512tvb_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2414,6 +2434,7 @@ static const struct tune_params neoversen2_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2603,6 +2624,7 @@ static const struct tune_params neoversev2_tunings =
>    "32:16",     /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    6,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    3,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2638,6 +2660,7 @@ static const struct tune_params a64fx_tunings =
>    "32",        /* loop_align.  */
>    4,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -3350,9 +3373,10 @@ aarch64_reassociation_width (unsigned opc, machine_mode mode)
>      return aarch64_tune_params.vec_reassoc_width;
>    if (INTEGRAL_MODE_P (mode))
>      return aarch64_tune_params.int_reassoc_width;
> -  /* Avoid reassociating floating point addition so we emit more FMAs.  */
> -  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
> -    return aarch64_tune_params.fp_reassoc_width;
> +  /* FMA's can have a different reassociation width.  */
> +  if (FLOAT_MODE_P (mode))
> +    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
> +                           : aarch64_tune_params.fp_reassoc_width;
>    return 1;
>  }

I guess an obvious question is: if 1 (rather than 2) was the right value
for cores with 2 FMA pipes, why is 4 the right value for cores with 4 FMA
pipes?  It would be good to clarify how, conceptually, the core property
should map to the fma_reassoc_width value.

It sounds from the existing comment like the main motivation for returning 1
was to encourage more FMAs to be formed, rather than to prevent FMAs from
being reassociated.  Is that no longer an issue?  Or is the point that,
with more FMA pipes, lower FMA formation is a price worth paying for
the better parallelism we get when FMAs can be formed?

Does this code ever see opc == FMA?

Thanks,
Richard

  reply	other threads:[~2022-11-22  7:58 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-09 12:40 Wilco Dijkstra
2022-11-22  7:58 ` Richard Sandiford [this message]
2022-11-22  8:28   ` Richard Biener
2022-11-22 10:35   ` Wilco Dijkstra
2022-11-22 14:13     ` Richard Sandiford
2022-11-23 18:10       ` Wilco Dijkstra
2022-11-24  7:06         ` Richard Sandiford

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=mpt5yf7beug.fsf@arm.com \
    --to=richard.sandiford@arm.com \
    --cc=Wilco.Dijkstra@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).