Hi All, This patch adds a new generic scheduling model "generic-armv8-a" and makes it the default for all Armv8 architectures. -mcpu=generic and -mtune=generic are kept around for those that really want the deprecated cost model. On SPECCPU 2017 this shows the following: generic: SPECINT 1.0% improvement in geomean, SPECFP -0.6%. The SPECFP regression is due to fotonik3d_r where we vectorize an FP calculation that only ever needs one lane of the result. This I believe is a generic costing bug but at the moment we can't change costs of FP and INT independently. So I will defer updating that cost to stage3 after Richard's other costing updates land. generic SVE: SPECINT 1.1% improvement in geomean, SPECFP 0.7% improvement. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR target/111370 * config/aarch64/aarch64-arches.def (armv8-r, armv8-a, armv8.1-a, armv8.2-a, armv8.3-a, armv8.4-a, armv8.5-a, armv8.6-a, armv8.7-a, armv8.8-a): Update to generic_armv8_a. * config/aarch64/aarch64-cores.def (generic-armv8-a): New. * config/aarch64/aarch64-tune.md: Regenerate. * config/aarch64/aarch64.cc: Include generic_armv8_a.h. * config/aarch64/aarch64.h (TARGET_CPU_DEFAULT): Change to TARGET_CPU_generic_armv8_a. * config/aarch64/tuning_models/generic_armv8_a.h: New file. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index 7ae92aa8e984e0a77efd5c5a5061c4c6f86e0118..f89e4ea1f48acc2875c9a834d93d94c94163cddc 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -30,19 +30,19 @@ Due to the assumptions about the positions of these fields in config.gcc, NAME should be kept as the first argument. 
*/ -AARCH64_ARCH("armv8-a", generic, V8A, 8, (SIMD)) -AARCH64_ARCH("armv8.1-a", generic, V8_1A, 8, (V8A, LSE, CRC, RDMA)) -AARCH64_ARCH("armv8.2-a", generic, V8_2A, 8, (V8_1A)) -AARCH64_ARCH("armv8.3-a", generic, V8_3A, 8, (V8_2A, PAUTH, RCPC)) -AARCH64_ARCH("armv8.4-a", generic, V8_4A, 8, (V8_3A, F16FML, DOTPROD, FLAGM)) -AARCH64_ARCH("armv8.5-a", generic, V8_5A, 8, (V8_4A, SB, SSBS, PREDRES)) -AARCH64_ARCH("armv8.6-a", generic, V8_6A, 8, (V8_5A, I8MM, BF16)) -AARCH64_ARCH("armv8.7-a", generic, V8_7A, 8, (V8_6A, LS64)) -AARCH64_ARCH("armv8.8-a", generic, V8_8A, 8, (V8_7A, MOPS)) -AARCH64_ARCH("armv8-r", generic, V8R , 8, (V8_4A)) -AARCH64_ARCH("armv9-a", generic, V9A , 9, (V8_5A, SVE2)) -AARCH64_ARCH("armv9.1-a", generic, V9_1A, 9, (V8_6A, V9A)) -AARCH64_ARCH("armv9.2-a", generic, V9_2A, 9, (V8_7A, V9_1A)) -AARCH64_ARCH("armv9.3-a", generic, V9_3A, 9, (V8_8A, V9_2A)) +AARCH64_ARCH("armv8-a", generic_armv8_a, V8A, 8, (SIMD)) +AARCH64_ARCH("armv8.1-a", generic_armv8_a, V8_1A, 8, (V8A, LSE, CRC, RDMA)) +AARCH64_ARCH("armv8.2-a", generic_armv8_a, V8_2A, 8, (V8_1A)) +AARCH64_ARCH("armv8.3-a", generic_armv8_a, V8_3A, 8, (V8_2A, PAUTH, RCPC)) +AARCH64_ARCH("armv8.4-a", generic_armv8_a, V8_4A, 8, (V8_3A, F16FML, DOTPROD, FLAGM)) +AARCH64_ARCH("armv8.5-a", generic_armv8_a, V8_5A, 8, (V8_4A, SB, SSBS, PREDRES)) +AARCH64_ARCH("armv8.6-a", generic_armv8_a, V8_6A, 8, (V8_5A, I8MM, BF16)) +AARCH64_ARCH("armv8.7-a", generic_armv8_a, V8_7A, 8, (V8_6A, LS64)) +AARCH64_ARCH("armv8.8-a", generic_armv8_a, V8_8A, 8, (V8_7A, MOPS)) +AARCH64_ARCH("armv8-r", generic_armv8_a, V8R , 8, (V8_4A)) +AARCH64_ARCH("armv9-a", generic, V9A , 9, (V8_5A, SVE2)) +AARCH64_ARCH("armv9.1-a", generic, V9_1A, 9, (V8_6A, V9A)) +AARCH64_ARCH("armv9.2-a", generic, V9_2A, 9, (V8_7A, V9_1A)) +AARCH64_ARCH("armv9.3-a", generic, V9_3A, 9, (V8_8A, V9_2A)) #undef AARCH64_ARCH diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 
3e363bd0e8bbc10cb5b28d6183647736318e6d40..30f4dd04ed71823bc34c0c405d49963b6b2d1375 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -191,5 +191,6 @@ AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, /* Generic Architecture Processors. */ AARCH64_CORE("generic", generic, cortexa53, V8A, (), generic, 0x0, 0x0, -1) +AARCH64_CORE("generic-armv8-a", generic_armv8_a, cortexa53, V8A, (), generic_armv8_a, 0x0, 0x0, -1) #undef AARCH64_CORE diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index cd5d79ea9c221874578a4d5804e4f618e671ebcd..0a32056f255de455f47a0b7395dfef0af84c6b5e 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic" + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 145bf536c28fdef84246e16d8351f4b4e357d27c..1ac298926ce1606a87bcdcaf691f182ca416d600 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -724,7 +724,7 @@ enum target_cpus /* If there is no CPU defined at configure, use generic as default. */ #ifndef TARGET_CPU_DEFAULT -# define TARGET_CPU_DEFAULT TARGET_CPU_generic +# define TARGET_CPU_DEFAULT TARGET_CPU_generic_armv8_a #endif /* If inserting NOP before a mult-accumulate insn remember to adjust the diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 9d59431d933021d71c5c202f0a61f807a2d2b0f1..1f5645e4886acd30ee5a437f60ffb53ee7b09436 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -355,6 +355,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = /* Tuning parameters. 
*/ #include "tuning_models/generic.h" +#include "tuning_models/generic_armv8_a.h" #include "tuning_models/cortexa35.h" #include "tuning_models/cortexa53.h" #include "tuning_models/cortexa57.h" diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h new file mode 100644 index 0000000000000000000000000000000000000000..82abe172834756696a3905dbf92464f73a1ea3da --- /dev/null +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h @@ -0,0 +1,191 @@ +/* Tuning model description for AArch64 architecture. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_H_GENERIC_ARMV8_A +#define GCC_AARCH64_H_GENERIC_ARMV8_A + +#include "generic.h" + +static const struct cpu_addrcost_table generic_armv8_a_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + +static const struct cpu_regmove_cost generic_armv8_a_regmove_cost = +{ + 1, /* GP2GP */ + /* Avoid the use of slow int<->fp moves for spilling by setting + their cost higher than memmov_cost. 
*/ + 5, /* GP2FP */ + 5, /* FP2GP */ + 2 /* FP2FP */ +}; + +/* Generic costs for Advanced SIMD vector operations. */ +static const advsimd_vec_cost generic_armv8_a_advsimd_vector_cost = +{ + 1, /* int_stmt_cost */ + 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ + 2, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + 2, /* vec_to_scalar_cost */ + 1, /* scalar_to_vec_cost */ + 1, /* align_load_cost */ + 1, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +/* Generic costs for SVE vector operations. */ +static const sve_vec_cost generic_armv8_a_sve_vector_cost = +{ + { + 1, /* int_stmt_cost */ + 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ + 2, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + 2, /* vec_to_scalar_cost */ + 1, /* scalar_to_vec_cost */ + 1, /* align_load_cost */ + 1, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 2, /* clast_cost */ + 2, /* fadda_f16_cost */ + 2, /* fadda_f32_cost */ + 2, /* fadda_f64_cost */ + 4, /* gather_load_x32_cost */ + 2, /* gather_load_x64_cost */ + 1 /* scatter_store_elt_cost */ +}; + +/* Generic costs for vector insn classes. 
*/ +static const struct cpu_vector_cost generic_armv8_a_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 1, /* scalar_fp_stmt_cost */ + 1, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 3, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &generic_armv8_a_advsimd_vector_cost, /* advsimd */ + &generic_armv8_a_sve_vector_cost, /* sve */ + nullptr /* issue_info */ +}; + +/* Generic costs for branch instructions. */ +static const struct cpu_branch_cost generic_armv8_a_branch_cost = +{ + 1, /* Predictable. */ + 3 /* Unpredictable. */ +}; + +/* Generic approximation modes. */ +static const cpu_approx_modes generic_armv8_a_approx_modes = +{ + AARCH64_APPROX_NONE, /* division */ + AARCH64_APPROX_NONE, /* sqrt */ + AARCH64_APPROX_NONE /* recip_sqrt */ +}; + +/* Generic prefetch settings (which disable prefetch). */ +static const cpu_prefetch_tune generic_armv8_a_prefetch_tune = +{ + 0, /* num_slots */ + -1, /* l1_cache_size */ + -1, /* l1_cache_line_size */ + -1, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + +static const struct tune_params generic_armv8_a_tunings = +{ + &cortexa76_extra_costs, + &generic_armv8_a_addrcost_table, + &generic_armv8_a_regmove_cost, + &generic_armv8_a_vector_cost, + &generic_armv8_a_branch_cost, + &generic_armv8_a_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ + { 4, /* load_int. */ + 2, /* store_int. */ + 5, /* load_fp. */ + 2, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. 
*/ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ + &generic_prefetch_tune, + AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ + AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ +}; + +#endif /* GCC_AARCH64_H_GENERIC_ARMV8_A. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c index aac06bd8093bed9e50928ee23f9a075888f14543..96e9935360100e25a4c01cceabc7aa840f520a3e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256" } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256 --param=aarch64-autovec-preference=2" } */ #include diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c index f6278916e1afeb3f0cb8fdbff4e98782ad0a726e..6f969a829425960b414508a7e354a1f39426a0e4 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256" } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256 --param=aarch64-autovec-preference=2" } */ #include diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_5.c index 03a6636f2d20b12f7e950a5bd6e43216139370fa..e6ec5157cd6dcc6b6dc24c5384432289b6dcdfba 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_5.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256" } 
*/ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256 --param=aarch64-autovec-preference=2" } */ #include diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_5.c index 9a2bd8f152ff32e8da1c4e2a73a31a249e5991c7..7ed35921b6f914441dc463c4030fcc4663a6813c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_5.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256" } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=256 --param=aarch64-autovec-preference=2" } */ #include diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_13.c b/gcc/testsuite/gcc.target/aarch64/target_attr_13.c index d5bee3a7b900bf9348c9cbfd67f487c381b13bf6..4bdb167944cda1861dd0462d905149646be69693 100644 --- a/gcc/testsuite/gcc.target/aarch64/target_attr_13.c +++ b/gcc/testsuite/gcc.target/aarch64/target_attr_13.c @@ -1,5 +1,5 @@ /* { dg-do assemble } */ -/* { dg-options "-O2 -march=armv8-a+crc+crypto -mcpu=generic" } */ +/* { dg-options "-O2 -mcpu=generic+crypto" } */ #include "arm_acle.h" diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_15.c b/gcc/testsuite/gcc.target/aarch64/target_attr_15.c index 069a0010865334324a100bab358bb53369f122fb..e6f31ba72ee77d1129f3cfbe2d90216d6c355c57 100644 --- a/gcc/testsuite/gcc.target/aarch64/target_attr_15.c +++ b/gcc/testsuite/gcc.target/aarch64/target_attr_15.c @@ -1,5 +1,5 @@ /* { dg-do assemble } */ -/* { dg-options "-march=armv8-a+crypto -mcpu=generic -save-temps" } */ +/* { dg-options "-mcpu=generic+crypto -save-temps" } */ /* Check that "+nothing" clears the ISA flags. */ --