From: Richard Sandiford
To: "Andre Vieira (lists)"
Cc: gcc-patches@gcc.gnu.org, Kyrylo Tkachov
Subject: Re: [aarch64] Add Neoverse N2 tuning structs
Date: Wed, 16 Mar 2022 16:09:54 +0000
In-Reply-To: (Andre Vieira's message of "Wed, 16 Mar 2022 14:46:14 +0000")

"Andre Vieira (lists)" writes:
> Hi,
>
> This patch adds tuning structures for Neoverse N2.
>
> 2022-03-16  Tamar Christina
>             Andre Vieira
>
>     * config/aarch64/aarch64.cc (neoversen2_addrcost_table,
>     neoversen2_regmove_cost, neoversen2_advsimd_vector_cost,
>     neoversen2_sve_vector_cost, neoversen2_scalar_issue_info,
>     neoversen2_advsimd_issue_info, neoversen2_sve_issue_info,
>     neoversen2_vec_issue_info, neoversen2_tunings): New structs.
>     (neoversen2_tunings): Use new structs and update tuning flags.
>     (aarch64_vec_op_count::rename_cycles_per_iter): Enable for
>     neoversen2 tuning.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d504fe7607b66a9c9ed9b183a2d3c03d34fb0f80..e0bb447beb9eae74551d863505eb265737d36334 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -519,6 +519,24 @@ static const struct cpu_addrcost_table neoversev1_addrcost_table =
>    0 /* imm_offset */
>  };
>  
> +static const struct cpu_addrcost_table neoversen2_addrcost_table =
> +{
> +    {
> +      1, /* hi */
> +      0, /* si */
> +      0, /* di */
> +      1, /* ti */
> +    },
> +  0, /* pre_modify */
> +  0, /* post_modify */
> +  2, /* post_modify_ld3_st3 */
> +  2, /* post_modify_ld4_st4 */
> +  0, /* register_offset */
> +  0, /* register_sextend */
> +  0, /* register_zextend */
> +  0 /* imm_offset */
> +};
> +
>  static const struct cpu_regmove_cost generic_regmove_cost =
>  {
>    1, /* GP2GP */
> @@ -624,6 +642,16 @@ static const struct cpu_regmove_cost a64fx_regmove_cost =
>    2 /* FP2FP */
>  };
>  
> +static const struct cpu_regmove_cost neoversen2_regmove_cost =
> +{
> +  1, /* GP2GP */
> +  /* Spilling to int<->fp instead of memory is recommended so set
> +     realistic costs compared to memmov_cost.  */
> +  3, /* GP2FP */
> +  2, /* FP2GP */
> +  2 /* FP2FP */
> +};
> +
>  /* Generic costs for Advanced SIMD vector operations.  */
>  static const advsimd_vec_cost generic_advsimd_vector_cost =
>  {
> @@ -2174,12 +2202,166 @@ static const struct tune_params neoverse512tvb_tunings =
>    &generic_prefetch_tune
>  };
>  
> +static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost */
> +  2, /* fp_stmt_cost */
> +  2, /* ld2_st2_permute_cost */
> +  2, /* ld3_st3_permute_cost */
> +  3, /* ld4_st4_permute_cost */
> +  3, /* permute_cost */
> +  4, /* reduc_i8_cost */
> +  4, /* reduc_i16_cost */
> +  2, /* reduc_i32_cost */
> +  2, /* reduc_i64_cost */
> +  6, /* reduc_f16_cost */
> +  4, /* reduc_f32_cost */
> +  2, /* reduc_f64_cost */
> +  2, /* store_elt_extra_cost */
> +  /* This value is just inherited from the Cortex-A57 table.  */
> +  8, /* vec_to_scalar_cost */
> +  /* This depends very much on what the scalar value is and
> +     where it comes from.  E.g. some constants take two dependent
> +     instructions or a load, while others might be moved from a GPR.
> +     4 seems to be a reasonable compromise in practice.  */
> +  4, /* scalar_to_vec_cost */
> +  4, /* align_load_cost */
> +  4, /* unalign_load_cost */
> +  /* Although stores have a latency of 2 and compete for the
> +     vector pipes, in practice it's better not to model that.  */
> +  1, /* unalign_store_cost */
> +  1 /* store_cost */
> +};
> +
> +static const sve_vec_cost neoversen2_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost */
> +    2, /* fp_stmt_cost */
> +    3, /* ld2_st2_permute_cost */
> +    4, /* ld3_st3_permute_cost */
> +    4, /* ld4_st4_permute_cost */
> +    3, /* permute_cost */
> +    /* Theoretically, a reduction involving 15 scalar ADDs could
> +       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
> +       completes in 11 cycles, so give it a cost of 15 + 6.  */
> +    21, /* reduc_i8_cost */
> +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> +    13, /* reduc_i16_cost */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> +    9, /* reduc_i32_cost */
> +    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 2: 1 + 1.  */

typo: 1 scalar ADD.

> +    2, /* reduc_i64_cost */
> +    /* Theoretically, a reduction involving 7 scalar FADDs could
> +       complete in ~8 cycles and would have a cost of 14.  FADDV
> +       completes in 6 cycles, so give it a cost of 14 - 2.  */
> +    12, /* reduc_f16_cost */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
> +    6, /* reduc_f32_cost */
> +    /* Likewise for 1 scalar FADDs (~2 cycles) vs. 2: 2 - 0.  */

Similarly here.

OK with those changes, thanks.

Richard

> +    2, /* reduc_f64_cost */
> +    2, /* store_elt_extra_cost */
> +    /* This value is just inherited from the Cortex-A57 table.  */
> +    8, /* vec_to_scalar_cost */
> +    /* See the comment above the Advanced SIMD versions.  */
> +    4, /* scalar_to_vec_cost */
> +    4, /* align_load_cost */
> +    4, /* unalign_load_cost */
> +    /* Although stores have a latency of 2 and compete for the
> +       vector pipes, in practice it's better not to model that.  */
> +    1, /* unalign_store_cost */
> +    1 /* store_cost */
> +  },
> +  3, /* clast_cost */
> +  10, /* fadda_f16_cost */
> +  6, /* fadda_f32_cost */
> +  4, /* fadda_f64_cost */
> +  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> +     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> +     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> +     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> +     (cost 2) to that, to avoid the difference being lost in rounding.
> +
> +     There is no easy comparison between a strided Advanced SIMD x32 load
> +     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> +     operation more than a 64-bit gather.  */
> +  14, /* gather_load_x32_cost */
> +  12, /* gather_load_x64_cost */
> +  3 /* scatter_store_elt_cost */
> +};
> +
> +static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
> +{
> +  3, /* loads_stores_per_cycle */
> +  2, /* stores_per_cycle */
> +  4, /* general_ops_per_cycle */
> +  0, /* fp_simd_load_general_ops */
> +  1 /* fp_simd_store_general_ops */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
> +{
> +  {
> +    3, /* loads_stores_per_cycle */
> +    2, /* stores_per_cycle */
> +    2, /* general_ops_per_cycle */
> +    0, /* fp_simd_load_general_ops */
> +    1 /* fp_simd_store_general_ops */
> +  },
> +  2, /* ld2_st2_general_ops */
> +  2, /* ld3_st3_general_ops */
> +  3 /* ld4_st4_general_ops */
> +};
> +
> +static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
> +{
> +  {
> +    {
> +      3, /* loads_per_cycle */
> +      2, /* stores_per_cycle */
> +      2, /* general_ops_per_cycle */
> +      0, /* fp_simd_load_general_ops */
> +      1 /* fp_simd_store_general_ops */
> +    },
> +    2, /* ld2_st2_general_ops */
> +    3, /* ld3_st3_general_ops */
> +    3 /* ld4_st4_general_ops */
> +  },
> +  2, /* pred_ops_per_cycle */
> +  2, /* while_pred_ops */
> +  2, /* int_cmp_pred_ops */
> +  1, /* fp_cmp_pred_ops */
> +  1, /* gather_scatter_pair_general_ops */
> +  1 /* gather_scatter_pair_pred_ops */
> +};
> +
> +static const aarch64_vec_issue_info neoversen2_vec_issue_info =
> +{
> +  &neoversen2_scalar_issue_info,
> +  &neoversen2_advsimd_issue_info,
> +  &neoversen2_sve_issue_info
> +};
> +
> +/* Neoverse N2 costs for vector insn classes.  */
> +static const struct cpu_vector_cost neoversen2_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost */
> +  2, /* scalar_fp_stmt_cost */
> +  4, /* scalar_load_cost */
> +  1, /* scalar_store_cost */
> +  1, /* cond_taken_branch_cost */
> +  1, /* cond_not_taken_branch_cost */
> +  &neoversen2_advsimd_vector_cost, /* advsimd */
> +  &neoversen2_sve_vector_cost, /* sve */
> +  &neoversen2_vec_issue_info /* issue_info */
> +};
> +
>  static const struct tune_params neoversen2_tunings =
>  {
>    &cortexa76_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &cortexa57_vector_cost,
> +  &neoversen2_addrcost_table,
> +  &neoversen2_regmove_cost,
> +  &neoversen2_vector_cost,
>    &generic_branch_cost,
>    &generic_approx_modes,
>    SVE_128, /* sve_width */
> @@ -2202,7 +2384,10 @@ static const struct tune_params neoversen2_tunings =
>    2, /* min_div_recip_mul_df.  */
>    0, /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
> +   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> +   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> +   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags.  */
>    &generic_prefetch_tune
>  };
>  
> @@ -15131,7 +15316,8 @@ aarch64_vec_op_count::sve_issue_info () const
>  fractional_cost
>  aarch64_vec_op_count::rename_cycles_per_iter () const
>  {
> -  if (sve_issue_info () == &neoverse512tvb_sve_issue_info)
> +  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
> +      || sve_issue_info () == &neoversen2_sve_issue_info)
>      /* + 1 for an addition.  We've already counted a general op for each
>         store, so we don't need to account for stores separately.  The branch
>         reads no registers and so does not need to be counted either.
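
As a quick cross-check of the arithmetic in the quoted cost comments, the
chosen constants can be restated as a minimal standalone sketch.  This is
not part of the patch; the sketch_* names are illustrative only and the
values simply mirror those in neoversen2_sve_vector_cost above.

/* SVE integer reductions: cost of the equivalent scalar ADD chain plus
   the extra latency of [SU]ADDV (e.g. ~11 cycles vs. ~5 for 16 bytes).  */
static const int sketch_reduc_i8_cost  = 15 + 6;  /* == 21 */
static const int sketch_reduc_i16_cost = 7 + 6;   /* == 13 */
static const int sketch_reduc_i32_cost = 3 + 6;   /* == 9 */
static const int sketch_reduc_i64_cost = 1 + 1;   /* == 2 */

/* FP reductions: FADDV is faster than the scalar FADD chain, so the
   adjustment is subtracted instead.  */
static const int sketch_reduc_f16_cost = 14 - 2;  /* == 12 */

/* Gathers: a 64-bit gather is costed as 2 scalar loads (2 * 4) plus a
   vec_construct (2) plus one full vector operation (2); a 32-bit gather
   is costed as one vector operation (2) more than that.  */
static const int sketch_gather_load_x64_cost = 2 * 4 + 2 + 2;                    /* == 12 */
static const int sketch_gather_load_x32_cost = sketch_gather_load_x64_cost + 2;  /* == 14 */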