From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7879) id C3A573858417; Wed, 15 Feb 2023 10:24:09 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C3A573858417 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1676456649; bh=y/DVciuvc7k+rCNvzQutb2UoOTOktEEnHchDDCowprc=; h=From:To:Subject:Date:From; b=CPmmYZlG3VIYJAzYaZwyWg7ix+vg2H7v2Cncz0CL8MCi0wxv2R+6KPoY1un9vyPah j0HvE5+9YpK+I6+JJjOsuCUCmJzQVNEli8mgxlk1qEiWZGgkCOSKkNuLw1rzJUlvzX oyctbFUZ24KTl0zZcwNPY1ncWUIVcXp0HWtqbFeY= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Filip Kastl To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/pheeck/heads/sccp)] AArch64: Add fma_reassoc_width [PR107413] X-Act-Checkin: gcc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/users/pheeck/heads/sccp X-Git-Oldrev: 50602717e9c3abcd443a94f69bf8c4df3ec24a02 X-Git-Newrev: b4f00f23beede52b8174b0df40be347f4a37ff65 Message-Id: <20230215102409.C3A573858417@sourceware.org> Date: Wed, 15 Feb 2023 10:24:09 +0000 (GMT) List-Id: https://gcc.gnu.org/g:b4f00f23beede52b8174b0df40be347f4a37ff65 commit b4f00f23beede52b8174b0df40be347f4a37ff65 Author: Wilco Dijkstra Date: Wed Nov 23 17:27:19 2022 +0000 AArch64: Add fma_reassoc_width [PR107413] Add a reassociation width for FMA in per-CPU tuning structures. Keep the existing setting of 1 for cores with 2 FMA pipes (this disables reassociation), and use 4 for cores with 4 FMA pipes. This improves SPECFP2017 on Neoverse V1 by ~1.5%. gcc/ PR tree-optimization/107413 * config/aarch64/aarch64.cc (struct tune_params): Add fma_reassoc_width to all CPU tuning structures. (aarch64_reassociation_width): Use fma_reassoc_width. * config/aarch64/aarch64-protos.h (struct tune_params): Add fma_reassoc_width. 
Diff: --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.cc | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 238820581c5..4be93c93c26 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -540,6 +540,7 @@ struct tune_params const char *loop_align; int int_reassoc_width; int fp_reassoc_width; + int fma_reassoc_width; int vec_reassoc_width; int min_div_recip_mul_sf; int min_div_recip_mul_df; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index c91df6f5006..15d478c77ce 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. 
*/ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings = "4", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings = "8", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings = "16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings = "16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. 
*/ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings = "16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings = "16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings = "16", /* loop_align. */ 3, /* int_reassoc_width. */ 2, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings = "16", /* loop_align. */ 3, /* int_reassoc_width. */ 2, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -1949,6 +1967,7 @@ static const struct tune_params ampere1a_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. 
*/ @@ -2126,6 +2145,7 @@ static const struct tune_params neoversev1_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 4, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -2263,6 +2283,7 @@ static const struct tune_params neoverse512tvb_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 4, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -2451,6 +2472,7 @@ static const struct tune_params neoversen2_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -2640,6 +2662,7 @@ static const struct tune_params neoversev2_tunings = "32:16", /* loop_align. */ 3, /* int_reassoc_width. */ 6, /* fp_reassoc_width. */ + 4, /* fma_reassoc_width. */ 3, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -2675,6 +2698,7 @@ static const struct tune_params a64fx_tunings = "32", /* loop_align. */ 4, /* int_reassoc_width. */ 2, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ @@ -3387,9 +3411,15 @@ aarch64_reassociation_width (unsigned opc, machine_mode mode) return aarch64_tune_params.vec_reassoc_width; if (INTEGRAL_MODE_P (mode)) return aarch64_tune_params.int_reassoc_width; - /* Avoid reassociating floating point addition so we emit more FMAs. */ - if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR) - return aarch64_tune_params.fp_reassoc_width; + /* Reassociation reduces the number of FMAs which may result in worse + performance. 
Use a per-CPU setting for FMA reassociation which allows + narrow CPUs with few FP pipes to switch it off (value of 1), and wider + CPUs with many FP pipes to enable reassociation. + Since the reassociation pass doesn't understand FMA at all, assume + that any FP addition might turn into FMA. */ + if (FLOAT_MODE_P (mode)) + return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width + : aarch64_tune_params.fp_reassoc_width; return 1; }