diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9fbfc548a891f5d11940c6fd3c49a14bfbdec886..07b1cde39209f5c7740e336b499e9aed31e4c515 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -354,2405 +354,30 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
};
/* Tuning parameters. */
-
-static const struct cpu_addrcost_table generic_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 0, /* register_sextend */
- 0, /* register_zextend */
- 0 /* imm_offset */
-};
-
-static const struct cpu_addrcost_table exynosm1_addrcost_table =
-{
- {
- 0, /* hi */
- 0, /* si */
- 0, /* di */
- 2, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 1, /* register_offset */
- 1, /* register_sextend */
- 2, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table xgene1_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 1, /* pre_modify */
- 1, /* post_modify */
- 1, /* post_modify_ld3_st3 */
- 1, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 1, /* register_sextend */
- 1, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
-{
- {
- 1, /* hi */
- 1, /* si */
- 1, /* di */
- 2, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 2, /* register_offset */
- 3, /* register_sextend */
- 3, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
-{
- {
- 1, /* hi */
- 1, /* si */
- 1, /* di */
- 2, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 2, /* register_offset */
- 3, /* register_sextend */
- 3, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table tsv110_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 1, /* register_sextend */
- 1, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table qdf24xx_addrcost_table =
-{
- {
- 1, /* hi */
- 1, /* si */
- 1, /* di */
- 2, /* ti */
- },
- 1, /* pre_modify */
- 1, /* post_modify */
- 1, /* post_modify_ld3_st3 */
- 1, /* post_modify_ld4_st4 */
- 3, /* register_offset */
- 3, /* register_sextend */
- 3, /* register_zextend */
- 2, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table a64fx_addrcost_table =
-{
- {
- 1, /* hi */
- 1, /* si */
- 1, /* di */
- 2, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 0, /* post_modify_ld3_st3 */
- 0, /* post_modify_ld4_st4 */
- 2, /* register_offset */
- 3, /* register_sextend */
- 3, /* register_zextend */
- 0, /* imm_offset */
-};
-
-static const struct cpu_addrcost_table neoversev1_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 3, /* post_modify_ld3_st3 */
- 3, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 0, /* register_sextend */
- 0, /* register_zextend */
- 0 /* imm_offset */
-};
-
-static const struct cpu_addrcost_table neoversen2_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 2, /* post_modify_ld3_st3 */
- 2, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 0, /* register_sextend */
- 0, /* register_zextend */
- 0 /* imm_offset */
-};
-
-static const struct cpu_addrcost_table neoversev2_addrcost_table =
-{
- {
- 1, /* hi */
- 0, /* si */
- 0, /* di */
- 1, /* ti */
- },
- 0, /* pre_modify */
- 0, /* post_modify */
- 2, /* post_modify_ld3_st3 */
- 2, /* post_modify_ld4_st4 */
- 0, /* register_offset */
- 0, /* register_sextend */
- 0, /* register_zextend */
- 0 /* imm_offset */
-};
-
-static const struct cpu_regmove_cost generic_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 5, /* GP2FP */
- 5, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost cortexa57_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 5, /* GP2FP */
- 5, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost cortexa53_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 5, /* GP2FP */
- 5, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost exynosm1_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost (actually 4 and 9).  */
- 9, /* GP2FP */
- 9, /* FP2GP */
- 1 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost thunderx_regmove_cost =
-{
- 2, /* GP2GP */
- 2, /* GP2FP */
- 6, /* FP2GP */
- 4 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost xgene1_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 8, /* GP2FP */
- 8, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost qdf24xx_regmove_cost =
-{
- 2, /* GP2GP */
- /* Avoid the use of int<->fp moves for spilling. */
- 6, /* GP2FP */
- 6, /* FP2GP */
- 4 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of int<->fp moves for spilling. */
- 5, /* GP2FP */
- 6, /* FP2GP */
- 3, /* FP2FP */
-};
-
-static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of int<->fp moves for spilling. */
- 4, /* GP2FP */
- 5, /* FP2GP */
- 4 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost tsv110_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 2, /* GP2FP */
- 3, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost a64fx_regmove_cost =
-{
- 1, /* GP2GP */
- /* Avoid the use of slow int<->fp moves for spilling by setting
- their cost higher than memmov_cost. */
- 5, /* GP2FP */
- 7, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost neoversen2_regmove_cost =
-{
- 1, /* GP2GP */
- /* Spilling to int<->fp instead of memory is recommended so set
- realistic costs compared to memmov_cost. */
- 3, /* GP2FP */
- 2, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost neoversev1_regmove_cost =
-{
- 1, /* GP2GP */
- /* Spilling to int<->fp instead of memory is recommended so set
- realistic costs compared to memmov_cost. */
- 3, /* GP2FP */
- 2, /* FP2GP */
- 2 /* FP2FP */
-};
-
-static const struct cpu_regmove_cost neoversev2_regmove_cost =
-{
- 1, /* GP2GP */
- /* Spilling to int<->fp instead of memory is recommended so set
- realistic costs compared to memmov_cost. */
- 3, /* GP2FP */
- 2, /* FP2GP */
- 2 /* FP2FP */
-};
-
-/* Generic costs for Advanced SIMD vector operations. */
-static const advsimd_vec_cost generic_advsimd_vector_cost =
-{
- 1, /* int_stmt_cost */
- 1, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 2, /* reduc_i8_cost */
- 2, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 2, /* reduc_f16_cost */
- 2, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- 2, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
- 1, /* align_load_cost */
- 1, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* Generic costs for SVE vector operations. */
-static const sve_vec_cost generic_sve_vector_cost =
-{
- {
- 1, /* int_stmt_cost */
- 1, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 2, /* reduc_i8_cost */
- 2, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 2, /* reduc_f16_cost */
- 2, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- 2, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
- 1, /* align_load_cost */
- 1, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 2, /* clast_cost */
- 2, /* fadda_f16_cost */
- 2, /* fadda_f32_cost */
- 2, /* fadda_f64_cost */
- 4, /* gather_load_x32_cost */
- 2, /* gather_load_x64_cost */
- 1 /* scatter_store_elt_cost */
-};
-
-/* Generic costs for vector insn classes. */
-static const struct cpu_vector_cost generic_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 1, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 3, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &generic_advsimd_vector_cost, /* advsimd */
- &generic_sve_vector_cost, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost a64fx_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 5, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 13, /* reduc_i8_cost */
- 13, /* reduc_i16_cost */
- 13, /* reduc_i32_cost */
- 13, /* reduc_i64_cost */
- 13, /* reduc_f16_cost */
- 13, /* reduc_f32_cost */
- 13, /* reduc_f64_cost */
- 13, /* store_elt_extra_cost */
- 13, /* vec_to_scalar_cost */
- 4, /* scalar_to_vec_cost */
- 6, /* align_load_cost */
- 6, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const sve_vec_cost a64fx_sve_vector_cost =
-{
- {
- 2, /* int_stmt_cost */
- 5, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 13, /* reduc_i8_cost */
- 13, /* reduc_i16_cost */
- 13, /* reduc_i32_cost */
- 13, /* reduc_i64_cost */
- 13, /* reduc_f16_cost */
- 13, /* reduc_f32_cost */
- 13, /* reduc_f64_cost */
- 13, /* store_elt_extra_cost */
- 13, /* vec_to_scalar_cost */
- 4, /* scalar_to_vec_cost */
- 6, /* align_load_cost */
- 6, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 13, /* clast_cost */
- 13, /* fadda_f16_cost */
- 13, /* fadda_f32_cost */
- 13, /* fadda_f64_cost */
- 64, /* gather_load_x32_cost */
- 32, /* gather_load_x64_cost */
- 1 /* scatter_store_elt_cost */
-};
-
-static const struct cpu_vector_cost a64fx_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 5, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 3, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &a64fx_advsimd_vector_cost, /* advsimd */
- &a64fx_sve_vector_cost, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
-{
- 1, /* int_stmt_cost */
- 3, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 1, /* reduc_i8_cost */
- 1, /* reduc_i16_cost */
- 1, /* reduc_i32_cost */
- 1, /* reduc_i64_cost */
- 1, /* reduc_f16_cost */
- 1, /* reduc_f32_cost */
- 1, /* reduc_f64_cost */
- 1, /* store_elt_extra_cost */
- 1, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
- 1, /* align_load_cost */
- 1, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* QDF24XX costs for vector insn classes. */
-static const struct cpu_vector_cost qdf24xx_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 1, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 3, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &qdf24xx_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost thunderx_advsimd_vector_cost =
-{
- 4, /* int_stmt_cost */
- 1, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 4, /* permute_cost */
- 2, /* reduc_i8_cost */
- 2, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 2, /* reduc_f16_cost */
- 2, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- 2, /* vec_to_scalar_cost */
- 2, /* scalar_to_vec_cost */
- 3, /* align_load_cost */
- 5, /* unalign_load_cost */
- 5, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* ThunderX costs for vector insn classes. */
-static const struct cpu_vector_cost thunderx_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 3, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 3, /* cond_taken_branch_cost */
- 3, /* cond_not_taken_branch_cost */
- &thunderx_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost tsv110_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 3, /* reduc_i8_cost */
- 3, /* reduc_i16_cost */
- 3, /* reduc_i32_cost */
- 3, /* reduc_i64_cost */
- 3, /* reduc_f16_cost */
- 3, /* reduc_f32_cost */
- 3, /* reduc_f64_cost */
- 3, /* store_elt_extra_cost */
- 3, /* vec_to_scalar_cost */
- 2, /* scalar_to_vec_cost */
- 5, /* align_load_cost */
- 5, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const struct cpu_vector_cost tsv110_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 5, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &tsv110_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 8, /* reduc_i8_cost */
- 8, /* reduc_i16_cost */
- 8, /* reduc_i32_cost */
- 8, /* reduc_i64_cost */
- 8, /* reduc_f16_cost */
- 8, /* reduc_f32_cost */
- 8, /* reduc_f64_cost */
- 8, /* store_elt_extra_cost */
- 8, /* vec_to_scalar_cost */
- 8, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* Cortex-A57 costs for vector insn classes. */
-static const struct cpu_vector_cost cortexa57_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &cortexa57_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
-{
- 3, /* int_stmt_cost */
- 3, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 3, /* reduc_i8_cost */
- 3, /* reduc_i16_cost */
- 3, /* reduc_i32_cost */
- 3, /* reduc_i64_cost */
- 3, /* reduc_f16_cost */
- 3, /* reduc_f32_cost */
- 3, /* reduc_f64_cost */
- 3, /* store_elt_extra_cost */
- 3, /* vec_to_scalar_cost */
- 3, /* scalar_to_vec_cost */
- 5, /* align_load_cost */
- 5, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const struct cpu_vector_cost exynosm1_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 5, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &exynosm1_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost xgene1_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 4, /* reduc_i8_cost */
- 4, /* reduc_i16_cost */
- 4, /* reduc_i32_cost */
- 4, /* reduc_i64_cost */
- 4, /* reduc_f16_cost */
- 4, /* reduc_f32_cost */
- 4, /* reduc_f64_cost */
- 4, /* store_elt_extra_cost */
- 4, /* vec_to_scalar_cost */
- 4, /* scalar_to_vec_cost */
- 10, /* align_load_cost */
- 10, /* unalign_load_cost */
- 2, /* unalign_store_cost */
- 2 /* store_cost */
-};
-
-/* X-Gene 1 costs for vector insn classes.  */
-static const struct cpu_vector_cost xgene1_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 1, /* scalar_fp_stmt_cost */
- 5, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 2, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &xgene1_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
-{
- 4, /* int_stmt_cost */
- 5, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 10, /* permute_cost */
- 6, /* reduc_i8_cost */
- 6, /* reduc_i16_cost */
- 6, /* reduc_i32_cost */
- 6, /* reduc_i64_cost */
- 6, /* reduc_f16_cost */
- 6, /* reduc_f32_cost */
- 6, /* reduc_f64_cost */
- 6, /* store_elt_extra_cost */
- 6, /* vec_to_scalar_cost */
- 5, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* Costs for vector insn classes for Vulcan. */
-static const struct cpu_vector_cost thunderx2t99_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 6, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 2, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &thunderx2t99_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
-{
- 5, /* int_stmt_cost */
- 5, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 10, /* permute_cost */
- 5, /* reduc_i8_cost */
- 5, /* reduc_i16_cost */
- 5, /* reduc_i32_cost */
- 5, /* reduc_i64_cost */
- 5, /* reduc_f16_cost */
- 5, /* reduc_f32_cost */
- 5, /* reduc_f64_cost */
- 5, /* store_elt_extra_cost */
- 5, /* vec_to_scalar_cost */
- 5, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- 4, /* unalign_store_cost */
- 4 /* store_cost */
-};
-
-static const struct cpu_vector_cost thunderx3t110_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 5, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 2, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &thunderx3t110_advsimd_vector_cost, /* advsimd */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-static const advsimd_vec_cost ampere1_advsimd_vector_cost =
-{
- 1, /* int_stmt_cost */
- 3, /* fp_stmt_cost */
- 0, /* ld2_st2_permute_cost */
- 0, /* ld3_st3_permute_cost */
- 0, /* ld4_st4_permute_cost */
- 2, /* permute_cost */
- 12, /* reduc_i8_cost */
- 9, /* reduc_i16_cost */
- 6, /* reduc_i32_cost */
- 5, /* reduc_i64_cost */
- 9, /* reduc_f16_cost */
- 6, /* reduc_f32_cost */
- 5, /* reduc_f64_cost */
- 8, /* store_elt_extra_cost */
- 6, /* vec_to_scalar_cost */
- 7, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-/* Ampere-1 costs for vector insn classes. */
-static const struct cpu_vector_cost ampere1_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 3, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
-  &ampere1_advsimd_vector_cost, /* advsimd  */
- nullptr, /* sve */
- nullptr /* issue_info */
-};
-
-/* Generic costs for branch instructions. */
-static const struct cpu_branch_cost generic_branch_cost =
-{
- 1, /* Predictable. */
- 3 /* Unpredictable. */
-};
-
-/* Generic approximation modes. */
-static const cpu_approx_modes generic_approx_modes =
-{
- AARCH64_APPROX_NONE, /* division */
- AARCH64_APPROX_NONE, /* sqrt */
- AARCH64_APPROX_NONE /* recip_sqrt */
-};
-
-/* Approximation modes for Exynos M1. */
-static const cpu_approx_modes exynosm1_approx_modes =
-{
- AARCH64_APPROX_NONE, /* division */
- AARCH64_APPROX_ALL, /* sqrt */
- AARCH64_APPROX_ALL /* recip_sqrt */
-};
-
-/* Approximation modes for X-Gene 1. */
-static const cpu_approx_modes xgene1_approx_modes =
-{
- AARCH64_APPROX_NONE, /* division */
- AARCH64_APPROX_NONE, /* sqrt */
- AARCH64_APPROX_ALL /* recip_sqrt */
-};
-
-/* Generic prefetch settings (which disable prefetch). */
-static const cpu_prefetch_tune generic_prefetch_tune =
-{
- 0, /* num_slots */
- -1, /* l1_cache_size */
- -1, /* l1_cache_line_size */
- -1, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune exynosm1_prefetch_tune =
-{
- 0, /* num_slots */
- -1, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- -1, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune qdf24xx_prefetch_tune =
-{
- 4, /* num_slots */
- 32, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 512, /* l2_cache_size */
- false, /* prefetch_dynamic_strides */
- 2048, /* minimum_stride */
- 3 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune thunderxt88_prefetch_tune =
-{
- 8, /* num_slots */
- 32, /* l1_cache_size */
- 128, /* l1_cache_line_size */
- 16*1024, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- 3 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune thunderx_prefetch_tune =
-{
- 8, /* num_slots */
- 32, /* l1_cache_size */
- 128, /* l1_cache_line_size */
- -1, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
-{
- 8, /* num_slots */
- 32, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 256, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
-{
- 8, /* num_slots */
- 32, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 256, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune tsv110_prefetch_tune =
-{
- 0, /* num_slots */
- 64, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 512, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune xgene1_prefetch_tune =
-{
- 8, /* num_slots */
- 32, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 256, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune a64fx_prefetch_tune =
-{
- 8, /* num_slots */
- 64, /* l1_cache_size */
- 256, /* l1_cache_line_size */
- 32768, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const cpu_prefetch_tune ampere1_prefetch_tune =
-{
- 0, /* num_slots */
- 64, /* l1_cache_size */
- 64, /* l1_cache_line_size */
- 2048, /* l2_cache_size */
- true, /* prefetch_dynamic_strides */
- -1, /* minimum_stride */
- -1 /* default_opt_level */
-};
-
-static const struct tune_params generic_tunings =
-{
- &cortexa57_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
- &generic_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 2, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "16:12", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
- Neoverse V1. It does not have a noticeable effect on A64FX and should
- have at most a very minor effect on SVE2 cores. */
- (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params cortexa35_tunings =
-{
- &cortexa53_extra_costs,
- &generic_addrcost_table,
- &cortexa53_regmove_cost,
- &generic_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 1, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params cortexa53_tunings =
-{
- &cortexa53_extra_costs,
- &generic_addrcost_table,
- &cortexa53_regmove_cost,
- &generic_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 2, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params cortexa57_tunings =
-{
- &cortexa57_extra_costs,
- &generic_addrcost_table,
- &cortexa57_regmove_cost,
- &cortexa57_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params cortexa72_tunings =
-{
- &cortexa57_extra_costs,
- &generic_addrcost_table,
- &cortexa57_regmove_cost,
- &cortexa57_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params cortexa73_tunings =
-{
- &cortexa57_extra_costs,
- &generic_addrcost_table,
- &cortexa57_regmove_cost,
- &cortexa57_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 2, /* issue_rate. */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params exynosm1_tunings =
-{
- &exynosm1_extra_costs,
- &exynosm1_addrcost_table,
- &exynosm1_regmove_cost,
- &exynosm1_vector_cost,
- &generic_branch_cost,
- &exynosm1_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
- "4", /* function_align. */
- "4", /* jump_align. */
- "4", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 48, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &exynosm1_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params thunderxt88_tunings =
-{
- &thunderx_extra_costs,
- &generic_addrcost_table,
- &thunderx_regmove_cost,
- &thunderx_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 6, /* load_int. */
- 6, /* store_int. */
- 6, /* load_fp. */
- 6, /* store_fp. */
- 6, /* load_pred. */
- 6 /* store_pred. */
- }, /* memmov_cost. */
- 2, /* issue_rate */
- AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
- "8", /* function_align. */
- "8", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &thunderxt88_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
-};
-
-static const struct tune_params thunderx_tunings =
-{
- &thunderx_extra_costs,
- &generic_addrcost_table,
- &thunderx_regmove_cost,
- &thunderx_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 6, /* load_int. */
- 6, /* store_int. */
- 6, /* load_fp. */
- 6, /* store_fp. */
- 6, /* load_pred. */
- 6 /* store_pred. */
- }, /* memmov_cost. */
- 2, /* issue_rate */
- AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
- "8", /* function_align. */
- "8", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
- &thunderx_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
-};
-
-static const struct tune_params tsv110_tunings =
-{
- &tsv110_extra_costs,
- &tsv110_addrcost_table,
- &tsv110_regmove_cost,
- &tsv110_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
- | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
- "16", /* function_align. */
- "4", /* jump_align. */
- "8", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &tsv110_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params xgene1_tunings =
-{
- &xgene1_extra_costs,
- &xgene1_addrcost_table,
- &xgene1_regmove_cost,
- &xgene1_vector_cost,
- &generic_branch_cost,
- &xgene1_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 6, /* load_int. */
- 6, /* store_int. */
- 6, /* load_fp. */
- 6, /* store_fp. */
- 6, /* load_pred. */
- 6 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- AARCH64_FUSE_NOTHING, /* fusible_ops */
- "16", /* function_align. */
- "16", /* jump_align. */
- "16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 17, /* max_case_values. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
- &xgene1_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params emag_tunings =
-{
- &xgene1_extra_costs,
- &xgene1_addrcost_table,
- &xgene1_regmove_cost,
- &xgene1_vector_cost,
- &generic_branch_cost,
- &xgene1_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
- { 6, /* load_int. */
- 6, /* store_int. */
- 6, /* load_fp. */
- 6, /* store_fp. */
- 6, /* load_pred. */
- 6 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- AARCH64_FUSE_NOTHING, /* fusible_ops */
- "16", /* function_align. */
- "16", /* jump_align. */
- "16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 17, /* max_case_values. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
- &xgene1_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params qdf24xx_tunings =
-{
- &qdf24xx_extra_costs,
- &qdf24xx_addrcost_table,
- &qdf24xx_regmove_cost,
- &qdf24xx_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
- "16", /* function_align. */
- "8", /* jump_align. */
- "16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
- &qdf24xx_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-/* Tuning structure for the Qualcomm Saphira core.  Default to generic values
-   for now.  */
-static const struct tune_params saphira_tunings =
-{
- &generic_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
- &generic_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
- "16", /* function_align. */
- "8", /* jump_align. */
- "16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 1, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params thunderx2t99_tunings =
-{
- &thunderx2t99_extra_costs,
- &thunderx2t99_addrcost_table,
- &thunderx2t99_regmove_cost,
- &thunderx2t99_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate. */
- (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
- | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
- "16", /* function_align. */
- "8", /* jump_align. */
- "16", /* loop_align. */
- 3, /* int_reassoc_width. */
- 2, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &thunderx2t99_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params thunderx3t110_tunings =
-{
- &thunderx3t110_extra_costs,
- &thunderx3t110_addrcost_table,
- &thunderx3t110_regmove_cost,
- &thunderx3t110_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 6, /* issue_rate. */
- (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
- | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
- "16", /* function_align. */
- "8", /* jump_align. */
- "16", /* loop_align. */
- 3, /* int_reassoc_width. */
- 2, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &thunderx3t110_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params neoversen1_tunings =
-{
- &cortexa76_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
- &cortexa57_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 2, /* store_int. */
- 5, /* load_fp. */
- 2, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params ampere1_tunings =
-{
-  &ampere1_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
-  &ampere1_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
- AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
- AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
- AARCH64_FUSE_CMP_BRANCH),
- /* fusible_ops */
- "32", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 4, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
-  &ampere1_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
-};
-
-static const struct tune_params ampere1a_tunings =
-{
-  &ampere1a_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
-  &ampere1_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_NOT_IMPLEMENTED, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 4, /* issue_rate */
- (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
- AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
- AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
- AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
- AARCH64_FUSE_ADDSUB_2REG_CONST1),
- /* fusible_ops */
- "32", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
-  &ampere1_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
-};
-
-static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 4, /* ld2_st2_permute_cost */
- 4, /* ld3_st3_permute_cost */
- 5, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 4, /* reduc_i8_cost */
- 4, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 6, /* reduc_f16_cost */
- 3, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* This depends very much on what the scalar value is and
- where it comes from. E.g. some constants take two dependent
- instructions or a load, while others might be moved from a GPR.
- 4 seems to be a reasonable compromise in practice. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const sve_vec_cost neoversev1_sve_vector_cost =
-{
- {
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 4, /* ld2_st2_permute_cost */
- 7, /* ld3_st3_permute_cost */
- 8, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- /* Theoretically, a reduction involving 31 scalar ADDs could
- complete in ~9 cycles and would have a cost of 31. [SU]ADDV
- completes in 14 cycles, so give it a cost of 31 + 5. */
- 36, /* reduc_i8_cost */
- /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
- 22, /* reduc_i16_cost */
- /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
- 14, /* reduc_i32_cost */
- /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
- 11, /* reduc_i64_cost */
- /* Theoretically, a reduction involving 15 scalar FADDs could
- complete in ~9 cycles and would have a cost of 30. FADDV
- completes in 13 cycles, so give it a cost of 30 + 4. */
- 34, /* reduc_f16_cost */
- /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
- 19, /* reduc_f32_cost */
- /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
- 11, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* See the comment above the Advanced SIMD versions. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 3, /* clast_cost */
- 19, /* fadda_f16_cost */
- 11, /* fadda_f32_cost */
- 8, /* fadda_f64_cost */
- 32, /* gather_load_x32_cost */
- 16, /* gather_load_x64_cost */
- 3 /* scatter_store_elt_cost */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
-{
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
-{
- {
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 2, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
-};
-
-static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
-{
- {
- {
- 2, /* loads_per_cycle */
- 2, /* stores_per_cycle */
- 2, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 2, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
- },
- 1, /* pred_ops_per_cycle */
- 2, /* while_pred_ops */
- 2, /* int_cmp_pred_ops */
- 1, /* fp_cmp_pred_ops */
- 1, /* gather_scatter_pair_general_ops */
- 1 /* gather_scatter_pair_pred_ops */
-};
-
-static const aarch64_vec_issue_info neoversev1_vec_issue_info =
-{
- &neoversev1_scalar_issue_info,
- &neoversev1_advsimd_issue_info,
- &neoversev1_sve_issue_info
-};
-
-/* Neoverse V1 costs for vector insn classes. */
-static const struct cpu_vector_cost neoversev1_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 2, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &neoversev1_advsimd_vector_cost, /* advsimd */
- &neoversev1_sve_vector_cost, /* sve */
- &neoversev1_vec_issue_info /* issue_info */
-};
-
-static const struct tune_params neoversev1_tunings =
-{
- &cortexa76_extra_costs,
- &neoversev1_addrcost_table,
- &neoversev1_regmove_cost,
- &neoversev1_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_256, /* sve_width */
- { 4, /* load_int. */
- 2, /* store_int. */
- 6, /* load_fp. */
- 2, /* store_fp. */
- 6, /* load_pred. */
- 1 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 4, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
- | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
- | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const sve_vec_cost neoverse512tvb_sve_vector_cost =
-{
- {
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 4, /* ld2_st2_permute_cost */
- 5, /* ld3_st3_permute_cost */
- 5, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- /* Theoretically, a reduction involving 15 scalar ADDs could
- complete in ~5 cycles and would have a cost of 15. Assume that
- [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
- 21, /* reduc_i8_cost */
- /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
- 13, /* reduc_i16_cost */
- /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
- 9, /* reduc_i32_cost */
- /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
- 8, /* reduc_i64_cost */
- /* Theoretically, a reduction involving 7 scalar FADDs could
- complete in ~6 cycles and would have a cost of 14. Assume that
- FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
- 16, /* reduc_f16_cost */
- /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
- 8, /* reduc_f32_cost */
- /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
- 4, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* This depends very much on what the scalar value is and
- where it comes from. E.g. some constants take two dependent
- instructions or a load, while others might be moved from a GPR.
- 4 seems to be a reasonable compromise in practice. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores generally have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 3, /* clast_cost */
- 10, /* fadda_f16_cost */
- 6, /* fadda_f32_cost */
- 4, /* fadda_f64_cost */
- /* A strided Advanced SIMD x64 load would take two parallel FP loads
- (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
- is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
- (cost 8) and a vec_construct (cost 2). Add a full vector operation
- (cost 2) to that, to avoid the difference being lost in rounding.
-
- There is no easy comparison between a strided Advanced SIMD x32 load
- and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
- operation more than a 64-bit gather. */
- 14, /* gather_load_x32_cost */
- 12, /* gather_load_x64_cost */
- 3 /* scatter_store_elt_cost */
-};
-
-static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
-{
- {
- {
- 3, /* loads_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 2, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
- },
- 2, /* pred_ops_per_cycle */
- 2, /* while_pred_ops */
- 2, /* int_cmp_pred_ops */
- 1, /* fp_cmp_pred_ops */
- 1, /* gather_scatter_pair_general_ops */
- 1 /* gather_scatter_pair_pred_ops */
-};
-
-static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
-{
- &neoversev1_scalar_issue_info,
- &neoversev1_advsimd_issue_info,
- &neoverse512tvb_sve_issue_info
-};
-
-static const struct cpu_vector_cost neoverse512tvb_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 2, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &neoversev1_advsimd_vector_cost, /* advsimd */
- &neoverse512tvb_sve_vector_cost, /* sve */
- &neoverse512tvb_vec_issue_info /* issue_info */
-};
-
-static const struct tune_params neoverse512tvb_tunings =
-{
- &cortexa76_extra_costs,
- &neoversev1_addrcost_table,
- &neoversev1_regmove_cost,
- &neoverse512tvb_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_128 | SVE_256, /* sve_width */
- { 4, /* load_int. */
- 2, /* store_int. */
- 6, /* load_fp. */
- 2, /* store_fp. */
- 6, /* load_pred. */
- 1 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 4, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
- | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 2, /* ld2_st2_permute_cost */
- 2, /* ld3_st3_permute_cost */
- 3, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 4, /* reduc_i8_cost */
- 4, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 6, /* reduc_f16_cost */
- 4, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* This depends very much on what the scalar value is and
- where it comes from. E.g. some constants take two dependent
- instructions or a load, while others might be moved from a GPR.
- 4 seems to be a reasonable compromise in practice. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const sve_vec_cost neoversen2_sve_vector_cost =
-{
- {
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 3, /* ld2_st2_permute_cost */
- 4, /* ld3_st3_permute_cost */
- 4, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- /* Theoretically, a reduction involving 15 scalar ADDs could
- complete in ~5 cycles and would have a cost of 15. [SU]ADDV
- completes in 11 cycles, so give it a cost of 15 + 6. */
- 21, /* reduc_i8_cost */
- /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
- 13, /* reduc_i16_cost */
- /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
- 9, /* reduc_i32_cost */
- /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
- 2, /* reduc_i64_cost */
- /* Theoretically, a reduction involving 7 scalar FADDs could
- complete in ~8 cycles and would have a cost of 14. FADDV
- completes in 6 cycles, so give it a cost of 14 - 2. */
- 12, /* reduc_f16_cost */
- /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
- 6, /* reduc_f32_cost */
- /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* See the comment above the Advanced SIMD versions. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 3, /* clast_cost */
- 10, /* fadda_f16_cost */
- 6, /* fadda_f32_cost */
- 4, /* fadda_f64_cost */
- /* A strided Advanced SIMD x64 load would take two parallel FP loads
- (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
- is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
- (cost 8) and a vec_construct (cost 2). Add a full vector operation
- (cost 2) to that, to avoid the difference being lost in rounding.
-
- There is no easy comparison between a strided Advanced SIMD x32 load
- and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
- operation more than a 64-bit gather. */
- 14, /* gather_load_x32_cost */
- 12, /* gather_load_x64_cost */
- 3 /* scatter_store_elt_cost */
-};
-
-static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
-{
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
-{
- {
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 2, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 2, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
-};
-
-static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
-{
- {
- {
- 3, /* loads_per_cycle */
- 2, /* stores_per_cycle */
- 2, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 3, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
- },
- 2, /* pred_ops_per_cycle */
- 2, /* while_pred_ops */
- 2, /* int_cmp_pred_ops */
- 1, /* fp_cmp_pred_ops */
- 1, /* gather_scatter_pair_general_ops */
- 1 /* gather_scatter_pair_pred_ops */
-};
-
-static const aarch64_vec_issue_info neoversen2_vec_issue_info =
-{
- &neoversen2_scalar_issue_info,
- &neoversen2_advsimd_issue_info,
- &neoversen2_sve_issue_info
-};
-
-/* Neoverse N2 costs for vector insn classes. */
-static const struct cpu_vector_cost neoversen2_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 2, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &neoversen2_advsimd_vector_cost, /* advsimd */
- &neoversen2_sve_vector_cost, /* sve */
- &neoversen2_vec_issue_info /* issue_info */
-};
-
-static const struct tune_params neoversen2_tunings =
-{
- &cortexa76_extra_costs,
- &neoversen2_addrcost_table,
- &neoversen2_regmove_cost,
- &neoversen2_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_128, /* sve_width */
- { 4, /* load_int. */
- 1, /* store_int. */
- 6, /* load_fp. */
- 2, /* store_fp. */
- 6, /* load_pred. */
- 1 /* store_pred. */
- }, /* memmov_cost. */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
- | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
- | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
-{
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 2, /* ld2_st2_permute_cost */
- 2, /* ld3_st3_permute_cost */
- 3, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- 4, /* reduc_i8_cost */
- 4, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 6, /* reduc_f16_cost */
- 3, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* This depends very much on what the scalar value is and
- where it comes from. E.g. some constants take two dependent
- instructions or a load, while others might be moved from a GPR.
- 4 seems to be a reasonable compromise in practice. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
-};
-
-static const sve_vec_cost neoversev2_sve_vector_cost =
-{
- {
- 2, /* int_stmt_cost */
- 2, /* fp_stmt_cost */
- 3, /* ld2_st2_permute_cost */
- 3, /* ld3_st3_permute_cost */
- 4, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
- /* Theoretically, a reduction involving 15 scalar ADDs could
- complete in ~3 cycles and would have a cost of 15. [SU]ADDV
- completes in 11 cycles, so give it a cost of 15 + 8. */
- 21, /* reduc_i8_cost */
- /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
- 14, /* reduc_i16_cost */
- /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
- 7, /* reduc_i32_cost */
- /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
- 2, /* reduc_i64_cost */
- /* Theoretically, a reduction involving 7 scalar FADDs could
- complete in ~6 cycles and would have a cost of 14. FADDV
- completes in 8 cycles, so give it a cost of 14 + 2. */
- 16, /* reduc_f16_cost */
- /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
- 8, /* reduc_f32_cost */
- /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
- 4, /* reduc_f64_cost */
- 2, /* store_elt_extra_cost */
- /* This value is just inherited from the Cortex-A57 table. */
- 8, /* vec_to_scalar_cost */
- /* See the comment above the Advanced SIMD versions. */
- 4, /* scalar_to_vec_cost */
- 4, /* align_load_cost */
- 4, /* unalign_load_cost */
- /* Although stores have a latency of 2 and compete for the
- vector pipes, in practice it's better not to model that. */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
- },
- 3, /* clast_cost */
- 10, /* fadda_f16_cost */
- 6, /* fadda_f32_cost */
- 4, /* fadda_f64_cost */
- /* A strided Advanced SIMD x64 load would take two parallel FP loads
- (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
- is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
- (cost 8) and a vec_construct (cost 2). Add a full vector operation
- (cost 2) to that, to avoid the difference being lost in rounding.
-
- There is no easy comparison between a strided Advanced SIMD x32 load
- and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
- operation more than a 64-bit gather. */
- 14, /* gather_load_x32_cost */
- 12, /* gather_load_x64_cost */
- 3 /* scatter_store_elt_cost */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
-{
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 6, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
-{
- {
- 3, /* loads_stores_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 2, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
-};
-
-static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
-{
- {
- {
- 3, /* loads_per_cycle */
- 2, /* stores_per_cycle */
- 4, /* general_ops_per_cycle */
- 0, /* fp_simd_load_general_ops */
- 1 /* fp_simd_store_general_ops */
- },
- 2, /* ld2_st2_general_ops */
- 3, /* ld3_st3_general_ops */
- 3 /* ld4_st4_general_ops */
- },
- 2, /* pred_ops_per_cycle */
- 2, /* while_pred_ops */
- 2, /* int_cmp_pred_ops */
- 1, /* fp_cmp_pred_ops */
- 1, /* gather_scatter_pair_general_ops */
- 1 /* gather_scatter_pair_pred_ops */
-};
-
-static const aarch64_vec_issue_info neoversev2_vec_issue_info =
-{
- &neoversev2_scalar_issue_info,
- &neoversev2_advsimd_issue_info,
- &neoversev2_sve_issue_info
-};
-
-/* Demeter costs for vector insn classes. */
-static const struct cpu_vector_cost neoversev2_vector_cost =
-{
- 1, /* scalar_int_stmt_cost */
- 2, /* scalar_fp_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
- 1, /* cond_taken_branch_cost */
- 1, /* cond_not_taken_branch_cost */
- &neoversev2_advsimd_vector_cost, /* advsimd */
- &neoversev2_sve_vector_cost, /* sve */
- &neoversev2_vec_issue_info /* issue_info */
-};
-
-static const struct tune_params neoversev2_tunings =
-{
- &cortexa76_extra_costs,
- &neoversev2_addrcost_table,
- &neoversev2_regmove_cost,
- &neoversev2_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_128, /* sve_width */
- { 4, /* load_int. */
- 2, /* store_int. */
- 6, /* load_fp. */
- 1, /* store_fp. */
- 6, /* load_pred. */
- 2 /* store_pred. */
- }, /* memmov_cost. */
- 5, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 3, /* int_reassoc_width. */
- 6, /* fp_reassoc_width. */
- 4, /* fma_reassoc_width. */
- 3, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
- | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
- | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
- &generic_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
-
-static const struct tune_params a64fx_tunings =
-{
- &a64fx_extra_costs,
- &a64fx_addrcost_table,
- &a64fx_regmove_cost,
- &a64fx_vector_cost,
- &generic_branch_cost,
- &generic_approx_modes,
- SVE_512, /* sve_width */
- { 4, /* load_int. */
- 4, /* store_int. */
- 4, /* load_fp. */
- 4, /* store_fp. */
- 4, /* load_pred. */
- 4 /* store_pred. */
- }, /* memmov_cost. */
- 7, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32", /* function_align. */
- "16", /* jump_align. */
- "32", /* loop_align. */
- 4, /* int_reassoc_width. */
- 2, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
- &a64fx_prefetch_tune,
- AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
- AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
-};
+#include "tuning_models/generic.h"
+#include "tuning_models/cortexa35.h"
+#include "tuning_models/cortexa53.h"
+#include "tuning_models/cortexa57.h"
+#include "tuning_models/cortexa72.h"
+#include "tuning_models/cortexa73.h"
+#include "tuning_models/exynosm1.h"
+#include "tuning_models/thunderxt88.h"
+#include "tuning_models/thunderx.h"
+#include "tuning_models/tsv110.h"
+#include "tuning_models/xgene1.h"
+#include "tuning_models/emag.h"
+#include "tuning_models/qdf24xx.h"
+#include "tuning_models/saphira.h"
+#include "tuning_models/thunderx2t99.h"
+#include "tuning_models/thunderx3t110.h"
+#include "tuning_models/neoversen1.h"
+#include "tuning_models/ampere1.h"
+#include "tuning_models/ampere1a.h"
+#include "tuning_models/neoversev1.h"
+#include "tuning_models/neoverse512tvb.h"
+#include "tuning_models/neoversen2.h"
+#include "tuning_models/neoversev2.h"
+#include "tuning_models/a64fx.h"
/* Support for fine-grained override of the tuning structures. */
struct aarch64_tuning_override_function
diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b06c27eba1e4de01738bdfdc077460f9135fb41
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/a64fx.h
@@ -0,0 +1,171 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_A64FX
+#define GCC_AARCH64_H_A64FX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table a64fx_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 1, /* si */
+ 1, /* di */
+ 2, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 2, /* register_offset */
+ 3, /* register_sextend */
+ 3, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost a64fx_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 5, /* GP2FP */
+ 7, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost a64fx_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
+ 13, /* store_elt_extra_cost */
+ 13, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const sve_vec_cost a64fx_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
+ 13, /* store_elt_extra_cost */
+ 13, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 13, /* clast_cost */
+ 13, /* fadda_f16_cost */
+ 13, /* fadda_f32_cost */
+ 13, /* fadda_f64_cost */
+ 64, /* gather_load_x32_cost */
+ 32, /* gather_load_x64_cost */
+ 1 /* scatter_store_elt_cost */
+};
+
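+/* A64FX costs for vector insn classes. */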
+static const struct cpu_vector_cost a64fx_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 5, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 3, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &a64fx_advsimd_vector_cost, /* advsimd */
+ &a64fx_sve_vector_cost, /* sve */
+ nullptr /* issue_info */
+};
+
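+/* A64FX prefetch settings; note the 256-byte L1 cache lines. */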
+static const cpu_prefetch_tune a64fx_prefetch_tune =
+{
+ 8, /* num_slots */
+ 64, /* l1_cache_size */
+ 256, /* l1_cache_line_size */
+ 32768, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params a64fx_tunings =
+{
+ &a64fx_extra_costs,
+ &a64fx_addrcost_table,
+ &a64fx_regmove_cost,
+ &a64fx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_512, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 7, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32", /* function_align. */
+ "16", /* jump_align. */
+ "32", /* loop_align. */
+ 4, /* int_reassoc_width. */
+ 2, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &a64fx_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_A64FX. */
diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d2a1c696103259f23cf73df26cef9d4fa05ac73
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1.h
@@ -0,0 +1,114 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_AMPERE1
+#define GCC_AARCH64_H_AMPERE1
+
+#include "generic.h"
+
+static const advsimd_vec_cost ampere1_advsimd_vector_cost =
+{
+ 1, /* int_stmt_cost */
+ 3, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 12, /* reduc_i8_cost */
+ 9, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 5, /* reduc_i64_cost */
+ 9, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 5, /* reduc_f64_cost */
+ 8, /* store_elt_extra_cost */
+ 6, /* vec_to_scalar_cost */
+ 7, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* Ampere-1 costs for vector insn classes. */
+static const struct cpu_vector_cost ampere1_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 3, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &ampere1_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
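+/* Ampere-1 prefetch settings. */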
+static const cpu_prefetch_tune ampere1_prefetch_tune =
+{
+ 0, /* num_slots */
+ 64, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 2048, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params ampere1_tunings =
+{
+ &ampere1_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &ampere1_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+ AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+ AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+ AARCH64_FUSE_CMP_BRANCH),
+ /* fusible_ops */
+ "32", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &ampere1_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1. */
diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
new file mode 100644
index 0000000000000000000000000000000000000000..c419ffb3c1a936a01690ad157c6c71dc645273c8
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1a.h
@@ -0,0 +1,65 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_AMPERE1A
+#define GCC_AARCH64_H_AMPERE1A
+
+#include "generic.h"
+
+static const struct tune_params ampere1a_tunings =
+{
+ &ampere1a_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &ampere1_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+ AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+ AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+ AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
+ AARCH64_FUSE_ADDSUB_2REG_CONST1),
+ /* fusible_ops */
+ "32", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &ampere1_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1A. */
diff --git a/gcc/config/aarch64/tuning_models/cortexa35.h b/gcc/config/aarch64/tuning_models/cortexa35.h
new file mode 100644
index 0000000000000000000000000000000000000000..5534335348db96cc57fc9eccd7ff79a624cb528a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa35.h
@@ -0,0 +1,62 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_CORTEXA35
+#define GCC_AARCH64_H_CORTEXA35
+
+#include "generic.h"
+#include "cortexa53.h"
+
+static const struct tune_params cortexa35_tunings =
+{
+ &cortexa53_extra_costs,
+ &generic_addrcost_table,
+ &cortexa53_regmove_cost,
+ &generic_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 1, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA35. */
diff --git a/gcc/config/aarch64/tuning_models/cortexa53.h b/gcc/config/aarch64/tuning_models/cortexa53.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfdccc5968e7f062af5c78f153bfe3838263b0a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa53.h
@@ -0,0 +1,71 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_CORTEXA53
+#define GCC_AARCH64_H_CORTEXA53
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa53_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 5, /* GP2FP */
+ 5, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const struct tune_params cortexa53_tunings =
+{
+ &cortexa53_extra_costs,
+ &generic_addrcost_table,
+ &cortexa53_regmove_cost,
+ &generic_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 2, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA53. */
diff --git a/gcc/config/aarch64/tuning_models/cortexa57.h b/gcc/config/aarch64/tuning_models/cortexa57.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c4789d57833a5879dda8e2fe454ac5f56cb0601
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa57.h
@@ -0,0 +1,109 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_CORTEXA57
+#define GCC_AARCH64_H_CORTEXA57
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa57_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 5, /* GP2FP */
+ 5, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 8, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 8, /* reduc_i32_cost */
+ 8, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 8, /* reduc_f32_cost */
+ 8, /* reduc_f64_cost */
+ 8, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 8, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* Cortex-A57 costs for vector insn classes. */
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &cortexa57_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+static const struct tune_params cortexa57_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &cortexa57_regmove_cost,
+ &cortexa57_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA57. */
diff --git a/gcc/config/aarch64/tuning_models/cortexa72.h b/gcc/config/aarch64/tuning_models/cortexa72.h
new file mode 100644
index 0000000000000000000000000000000000000000..968171c9b2e898d7479dbcb462e33fe3905e183d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa72.h
@@ -0,0 +1,61 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_CORTEXA72
+#define GCC_AARCH64_H_CORTEXA72
+
+#include "generic.h"
+
+static const struct tune_params cortexa72_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &cortexa57_regmove_cost,
+ &cortexa57_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA72. */
diff --git a/gcc/config/aarch64/tuning_models/cortexa73.h b/gcc/config/aarch64/tuning_models/cortexa73.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d1a504ddac39604dd193ce0f434fd2f5145c129
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa73.h
@@ -0,0 +1,61 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_CORTEXA73
+#define GCC_AARCH64_H_CORTEXA73
+
+#include "generic.h"
+
+static const struct tune_params cortexa73_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &cortexa57_regmove_cost,
+ &cortexa57_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 2, /* issue_rate. */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA73. */
diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f3402c3fc2a94704eeaf9223ecb0ca1c057cace
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/emag.h
@@ -0,0 +1,60 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_EMAG
+#define GCC_AARCH64_H_EMAG
+
+#include "generic.h"
+
+static const struct tune_params emag_tunings =
+{
+ &xgene1_extra_costs,
+ &xgene1_addrcost_table,
+ &xgene1_regmove_cost,
+ &xgene1_vector_cost,
+ &generic_branch_cost,
+ &xgene1_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 6, /* load_int. */
+ 6, /* store_int. */
+ 6, /* load_fp. */
+ 6, /* store_fp. */
+ 6, /* load_pred. */
+ 6 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ AARCH64_FUSE_NOTHING, /* fusible_ops */
+ "16", /* function_align. */
+ "16", /* jump_align. */
+ "16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 17, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
+ &xgene1_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_EMAG. */
diff --git a/gcc/config/aarch64/tuning_models/exynosm1.h b/gcc/config/aarch64/tuning_models/exynosm1.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ea4df97f3f048c41481c304fd3684a69d743b
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/exynosm1.h
@@ -0,0 +1,146 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_EXYNOSM1
+#define GCC_AARCH64_H_EXYNOSM1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+ {
+ 0, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 2, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 1, /* register_offset */
+ 1, /* register_sextend */
+ 2, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost (actual, 4 and 9). */
+ 9, /* GP2FP */
+ 9, /* FP2GP */
+ 1 /* FP2FP */
+};
+
+static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
+{
+ 3, /* int_stmt_cost */
+ 3, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
+ 3, /* store_elt_extra_cost */
+ 3, /* vec_to_scalar_cost */
+ 3, /* scalar_to_vec_cost */
+ 5, /* align_load_cost */
+ 5, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
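+/* Exynos M1 costs for vector insn classes. */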
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 5, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &exynosm1_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+/* Approximation modes for Exynos M1. */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+ AARCH64_APPROX_NONE, /* division */
+ AARCH64_APPROX_ALL, /* sqrt */
+ AARCH64_APPROX_ALL /* recip_sqrt */
+};
+
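+/* Exynos M1 prefetch settings. */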
+static const cpu_prefetch_tune exynosm1_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params exynosm1_tunings =
+{
+ &exynosm1_extra_costs,
+ &exynosm1_addrcost_table,
+ &exynosm1_regmove_cost,
+ &exynosm1_vector_cost,
+ &generic_branch_cost,
+ &exynosm1_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
+ "4", /* function_align. */
+ "4", /* jump_align. */
+ "4", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 48, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &exynosm1_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_EXYNOSM1. */
diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
new file mode 100644
index 0000000000000000000000000000000000000000..deb2c1cffe255bddcb5be571b12086442782da60
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -0,0 +1,190 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+ Contributed by ARM Ltd.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_GENERIC
+#define GCC_AARCH64_H_GENERIC
+
+static const struct cpu_addrcost_table generic_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
+static const struct cpu_regmove_cost generic_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 5, /* GP2FP */
+ 5, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+/* Generic costs for Advanced SIMD vector operations. */
+static const advsimd_vec_cost generic_advsimd_vector_cost =
+{
+ 1, /* int_stmt_cost */
+ 1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 2, /* vec_to_scalar_cost */
+ 1, /* scalar_to_vec_cost */
+ 1, /* align_load_cost */
+ 1, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* Generic costs for SVE vector operations. */
+static const sve_vec_cost generic_sve_vector_cost =
+{
+ {
+ 1, /* int_stmt_cost */
+ 1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 2, /* vec_to_scalar_cost */
+ 1, /* scalar_to_vec_cost */
+ 1, /* align_load_cost */
+ 1, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 2, /* clast_cost */
+ 2, /* fadda_f16_cost */
+ 2, /* fadda_f32_cost */
+ 2, /* fadda_f64_cost */
+ 4, /* gather_load_x32_cost */
+ 2, /* gather_load_x64_cost */
+ 1 /* scatter_store_elt_cost */
+};
+
+/* Generic costs for vector insn classes. */
+static const struct cpu_vector_cost generic_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 1, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 3, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &generic_advsimd_vector_cost, /* advsimd */
+ &generic_sve_vector_cost, /* sve */
+ nullptr /* issue_info */
+};
+
+/* Generic costs for branch instructions. */
+static const struct cpu_branch_cost generic_branch_cost =
+{
+ 1, /* Predictable. */
+ 3 /* Unpredictable. */
+};
+
+/* Generic approximation modes. */
+static const cpu_approx_modes generic_approx_modes =
+{
+ AARCH64_APPROX_NONE, /* division */
+ AARCH64_APPROX_NONE, /* sqrt */
+ AARCH64_APPROX_NONE /* recip_sqrt */
+};
+
+/* Generic prefetch settings (which disable prefetch). */
+static const cpu_prefetch_tune generic_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ -1, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params generic_tunings =
+{
+ &cortexa57_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &generic_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 2, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "16:12", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
+ Neoverse V1. It does not have a noticeable effect on A64FX and should
+ have at most a very minor effect on SVE2 cores. */
+ (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_GENERIC. */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
new file mode 100644
index 0000000000000000000000000000000000000000..50d7b23712cc6a8be8f35246657ec5d86d6d4191
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -0,0 +1,165 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_NEOVERSE512TVB
+#define GCC_AARCH64_H_NEOVERSE512TVB
+
+#include "generic.h"
+
+static const sve_vec_cost neoverse512tvb_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 4, /* ld2_st2_permute_cost */
+ 5, /* ld3_st3_permute_cost */
+ 5, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ /* Theoretically, a reduction involving 15 scalar ADDs could
+ complete in ~5 cycles and would have a cost of 15. Assume that
+ [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
+ 21, /* reduc_i8_cost */
+ /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
+ 13, /* reduc_i16_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
+ 9, /* reduc_i32_cost */
+ /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
+ 8, /* reduc_i64_cost */
+ /* Theoretically, a reduction involving 7 scalar FADDs could
+ complete in ~6 cycles and would have a cost of 14. Assume that
+ FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
+ 16, /* reduc_f16_cost */
+ /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
+ 8, /* reduc_f32_cost */
+ /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* This depends very much on what the scalar value is and
+ where it comes from. E.g. some constants take two dependent
+ instructions or a load, while others might be moved from a GPR.
+ 4 seems to be a reasonable compromise in practice. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores generally have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ /* A strided Advanced SIMD x64 load would take two parallel FP loads
+ (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
+ is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
+ (cost 8) and a vec_construct (cost 2). Add a full vector operation
+ (cost 2) to that, to avoid the difference being lost in rounding.
+
+ There is no easy comparison between a strided Advanced SIMD x32 load
+ and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+ operation more than a 64-bit gather. */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 3 /* scatter_store_elt_cost */
+};
+
+static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 2, /* while_pred_ops */
+ 2, /* int_cmp_pred_ops */
+ 1, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
+{
+ &neoversev1_scalar_issue_info,
+ &neoversev1_advsimd_issue_info,
+ &neoverse512tvb_sve_issue_info
+};
+
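+/* Neoverse 512-TVB costs for vector insn classes. */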
+static const struct cpu_vector_cost neoverse512tvb_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversev1_advsimd_vector_cost, /* advsimd */
+ &neoverse512tvb_sve_vector_cost, /* sve */
+ &neoverse512tvb_vec_issue_info /* issue_info */
+};
+
+static const struct tune_params neoverse512tvb_tunings =
+{
+ &cortexa76_extra_costs,
+ &neoversev1_addrcost_table,
+ &neoversev1_regmove_cost,
+ &neoverse512tvb_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128 | SVE_256, /* sve_width */
+ { 4, /* load_int. */
+ 2, /* store_int. */
+ 6, /* load_fp. */
+ 2, /* store_fp. */
+ 6, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSE512TVB. */
diff --git a/gcc/config/aarch64/tuning_models/neoversen1.h b/gcc/config/aarch64/tuning_models/neoversen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..132166d3d06430b725e4448937332cc159c11cda
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen1.h
@@ -0,0 +1,60 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN1
+#define GCC_AARCH64_H_NEOVERSEN1
+
+#include "generic.h"
+
+static const struct tune_params neoversen1_tunings =
+{
+ &cortexa76_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &cortexa57_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 2, /* store_int. */
+ 5, /* load_fp. */
+ 2, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN1. */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
new file mode 100644
index 0000000000000000000000000000000000000000..395a6d82b8403e586bf179cade055543cf9b9eb0
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -0,0 +1,245 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN2
+#define GCC_AARCH64_H_NEOVERSEN2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversen2_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 2, /* post_modify_ld3_st3 */
+ 2, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
+static const struct cpu_regmove_cost neoversen2_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended, so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 2, /* FP2GP */
+ 2 /* FP2FP */
+};
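+
+/* For illustration only: with the memmov_cost table further down,
+ spilling an integer value costs a store (1) plus a reload (4), while
+ a round trip through the SIMD register file costs GP2FP + FP2GP
+ = 3 + 2 = 5, so the register allocator treats moving a value across
+ register files as a realistic alternative to spilling it. */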
+
+static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* This depends very much on what the scalar value is and
+ where it comes from. E.g. some constants take two dependent
+ instructions or a load, while others might be moved from a GPR.
+ 4 seems to be a reasonable compromise in practice. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const sve_vec_cost neoversen2_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 3, /* ld2_st2_permute_cost */
+ 4, /* ld3_st3_permute_cost */
+ 4, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ /* Theoretically, a reduction involving 15 scalar ADDs could
+ complete in ~5 cycles and would have a cost of 15. [SU]ADDV
+ completes in 11 cycles, so give it a cost of 15 + 6. */
+ 21, /* reduc_i8_cost */
+ /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
+ 13, /* reduc_i16_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
+ 9, /* reduc_i32_cost */
+ /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
+ 2, /* reduc_i64_cost */
+ /* Theoretically, a reduction involving 7 scalar FADDs could
+ complete in ~8 cycles and would have a cost of 14. FADDV
+ completes in 6 cycles, so give it a cost of 14 - 2. */
+ 12, /* reduc_f16_cost */
+ /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
+ 6, /* reduc_f32_cost */
+ /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* See the comment above the Advanced SIMD versions. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ /* A strided Advanced SIMD x64 load would take two parallel FP loads
+ (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
+ is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
+ (cost 8) and a vec_construct (cost 2). Add a full vector operation
+ (cost 2) to that, to avoid the difference being lost in rounding.
+
+ There is no easy comparison between a strided Advanced SIMD x32 load
+ and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+ operation more than a 64-bit gather. */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 3 /* scatter_store_elt_cost */
+};
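+
+/* A worked example of how the entries above combine (a sketch only):
+ an SVE loop body with one contiguous load, one integer operation and
+ one contiguous store is costed 4 + 2 + 1 = 7 units per vector
+ iteration, before the issue information below refines the estimate. */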
+
+static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
+{
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 3, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 2, /* while_pred_ops */
+ 2, /* int_cmp_pred_ops */
+ 1, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoversen2_vec_issue_info =
+{
+ &neoversen2_scalar_issue_info,
+ &neoversen2_advsimd_issue_info,
+ &neoversen2_sve_issue_info
+};
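+
+/* The issue info above drives the throughput side of the cost model
+ (enabled by AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS): the counts of
+ loads, stores and general ops in a loop body are divided by the
+ matching *_per_cycle rates, and the largest ratio estimates the
+ minimum cycles per iteration. As a rough sketch, an SVE body with
+ 3 loads, 1 store and 4 general ops is bound by the general ops:
+ max (3/3, 1/2, 4/2) = 2 cycles per iteration. */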
+
+/* Neoverse N2 costs for vector insn classes. */
+static const struct cpu_vector_cost neoversen2_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversen2_advsimd_vector_cost, /* advsimd */
+ &neoversen2_sve_vector_cost, /* sve */
+ &neoversen2_vec_issue_info /* issue_info */
+};
+
+static const struct tune_params neoversen2_tunings =
+{
+ &cortexa76_extra_costs,
+ &neoversen2_addrcost_table,
+ &neoversen2_regmove_cost,
+ &neoversen2_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128, /* sve_width */
+ { 4, /* load_int. */
+ 1, /* store_int. */
+ 6, /* load_fp. */
+ 2, /* store_fp. */
+ 6, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN2. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
new file mode 100644
index 0000000000000000000000000000000000000000..584a5000e06f598dcdd3bcc533dc6dbc642223ca
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -0,0 +1,237 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV1
+#define GCC_AARCH64_H_NEOVERSEV1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev1_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 3, /* post_modify_ld3_st3 */
+ 3, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
+static const struct cpu_regmove_cost neoversev1_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended, so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 2, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 4, /* ld2_st2_permute_cost */
+ 4, /* ld3_st3_permute_cost */
+ 5, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* This depends very much on what the scalar value is and
+ where it comes from. E.g. some constants take two dependent
+ instructions or a load, while others might be moved from a GPR.
+ 4 seems to be a reasonable compromise in practice. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const sve_vec_cost neoversev1_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 4, /* ld2_st2_permute_cost */
+ 7, /* ld3_st3_permute_cost */
+ 8, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ /* Theoretically, a reduction involving 31 scalar ADDs could
+ complete in ~9 cycles and would have a cost of 31. [SU]ADDV
+ completes in 14 cycles, so give it a cost of 31 + 5. */
+ 36, /* reduc_i8_cost */
+ /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
+ 22, /* reduc_i16_cost */
+ /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
+ 14, /* reduc_i32_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
+ 11, /* reduc_i64_cost */
+ /* Theoretically, a reduction involving 15 scalar FADDs could
+ complete in ~9 cycles and would have a cost of 30. FADDV
+ completes in 13 cycles, so give it a cost of 30 + 4. */
+ 34, /* reduc_f16_cost */
+ /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
+ 19, /* reduc_f32_cost */
+ /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
+ 11, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* See the comment above the Advanced SIMD versions. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 19, /* fadda_f16_cost */
+ 11, /* fadda_f32_cost */
+ 8, /* fadda_f64_cost */
+ 32, /* gather_load_x32_cost */
+ 16, /* gather_load_x64_cost */
+ 3 /* scatter_store_elt_cost */
+};
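+
+/* Note on the gather costs above: at the 256-bit vector length this
+ model targets, a 32-bit gather loads 8 elements and a 64-bit gather
+ loads 4, so both entries work out to about 4 units per element. */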
+
+static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
+{
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
+{
+ {
+ {
+ 2, /* loads_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 1, /* pred_ops_per_cycle */
+ 2, /* while_pred_ops */
+ 2, /* int_cmp_pred_ops */
+ 1, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoversev1_vec_issue_info =
+{
+ &neoversev1_scalar_issue_info,
+ &neoversev1_advsimd_issue_info,
+ &neoversev1_sve_issue_info
+};
+
+/* Neoverse V1 costs for vector insn classes. */
+static const struct cpu_vector_cost neoversev1_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversev1_advsimd_vector_cost, /* advsimd */
+ &neoversev1_sve_vector_cost, /* sve */
+ &neoversev1_vec_issue_info /* issue_info */
+};
+
+static const struct tune_params neoversev1_tunings =
+{
+ &cortexa76_extra_costs,
+ &neoversev1_addrcost_table,
+ &neoversev1_regmove_cost,
+ &neoversev1_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_256, /* sve_width */
+ { 4, /* load_int. */
+ 2, /* store_int. */
+ 6, /* load_fp. */
+ 2, /* store_fp. */
+ 6, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 3, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+ | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEV1. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d4244ef4c99ecdffb7408e39dc21bc191223de
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -0,0 +1,245 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV2
+#define GCC_AARCH64_H_NEOVERSEV2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev2_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 2, /* post_modify_ld3_st3 */
+ 2, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
+static const struct cpu_regmove_cost neoversev2_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended, so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 2, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* This depends very much on what the scalar value is and
+ where it comes from. E.g. some constants take two dependent
+ instructions or a load, while others might be moved from a GPR.
+ 4 seems to be a reasonable compromise in practice. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const sve_vec_cost neoversev2_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 3, /* ld2_st2_permute_cost */
+ 3, /* ld3_st3_permute_cost */
+ 4, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ /* Theoretically, a reduction involving 15 scalar ADDs could
+ complete in ~3 cycles and would have a cost of 15. [SU]ADDV
+ completes in 11 cycles, so give it a cost of 15 + 6. */
+ 21, /* reduc_i8_cost */
+ /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
+ 14, /* reduc_i16_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
+ 7, /* reduc_i32_cost */
+ /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
+ 2, /* reduc_i64_cost */
+ /* Theoretically, a reduction involving 7 scalar FADDs could
+ complete in ~6 cycles and would have a cost of 14. FADDV
+ completes in 8 cycles, so give it a cost of 14 + 2. */
+ 16, /* reduc_f16_cost */
+ /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
+ 8, /* reduc_f32_cost */
+ /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* See the comment above the Advanced SIMD versions. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ /* A strided Advanced SIMD x64 load would take two parallel FP loads
+ (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
+ is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
+ (cost 8) and a vec_construct (cost 2). Add a full vector operation
+ (cost 2) to that, to avoid the difference being lost in rounding.
+
+ There is no easy comparison between a strided Advanced SIMD x32 load
+ and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+ operation more than a 64-bit gather. */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 3 /* scatter_store_elt_cost */
+};
+
+static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
+{
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 3, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 2, /* while_pred_ops */
+ 2, /* int_cmp_pred_ops */
+ 1, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoversev2_vec_issue_info =
+{
+ &neoversev2_scalar_issue_info,
+ &neoversev2_advsimd_issue_info,
+ &neoversev2_sve_issue_info
+};
+
+/* Neoverse V2 ("Demeter") costs for vector insn classes. */
+static const struct cpu_vector_cost neoversev2_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversev2_advsimd_vector_cost, /* advsimd */
+ &neoversev2_sve_vector_cost, /* sve */
+ &neoversev2_vec_issue_info /* issue_info */
+};
+
+static const struct tune_params neoversev2_tunings =
+{
+ &cortexa76_extra_costs,
+ &neoversev2_addrcost_table,
+ &neoversev2_regmove_cost,
+ &neoversev2_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128, /* sve_width */
+ { 4, /* load_int. */
+ 2, /* store_int. */
+ 6, /* load_fp. */
+ 1, /* store_fp. */
+ 6, /* load_pred. */
+ 2 /* store_pred. */
+ }, /* memmov_cost. */
+ 5, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 3, /* int_reassoc_width. */
+ 6, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 3, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEV2. */
diff --git a/gcc/config/aarch64/tuning_models/qdf24xx.h b/gcc/config/aarch64/tuning_models/qdf24xx.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c9b9f5843acc15450a2492b141c02ee48a3f13
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/qdf24xx.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_QDF24XX
+#define GCC_AARCH64_H_QDF24XX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 1, /* si */
+ 1, /* di */
+ 2, /* ti */
+ },
+ 1, /* pre_modify */
+ 1, /* post_modify */
+ 1, /* post_modify_ld3_st3 */
+ 1, /* post_modify_ld4_st4 */
+ 3, /* register_offset */
+ 3, /* register_sextend */
+ 3, /* register_zextend */
+ 2, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
+{
+ 2, /* GP2GP */
+ /* Avoid the use of int<->fp moves for spilling. */
+ 6, /* GP2FP */
+ 6, /* FP2GP */
+ 4 /* FP2FP */
+};
+
+static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
+{
+ 1, /* int_stmt_cost */
+ 3, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 1, /* reduc_i8_cost */
+ 1, /* reduc_i16_cost */
+ 1, /* reduc_i32_cost */
+ 1, /* reduc_i64_cost */
+ 1, /* reduc_f16_cost */
+ 1, /* reduc_f32_cost */
+ 1, /* reduc_f64_cost */
+ 1, /* store_elt_extra_cost */
+ 1, /* vec_to_scalar_cost */
+ 1, /* scalar_to_vec_cost */
+ 1, /* align_load_cost */
+ 1, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* QDF24XX costs for vector insn classes. */
+static const struct cpu_vector_cost qdf24xx_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 1, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 3, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &qdf24xx_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
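+
+/* A null SVE table and null issue info are the convention for cores
+ without SVE: the vectorizer then costs Advanced SIMD code only and
+ falls back to the older model without issue-rate estimates. */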
+
+static const cpu_prefetch_tune qdf24xx_prefetch_tune =
+{
+ 4, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 512, /* l2_cache_size */
+ false, /* prefetch_dynamic_strides */
+ 2048, /* minimum_stride */
+ 3 /* default_opt_level */
+};
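+
+/* Roughly speaking, these settings keep -fprefetch-loop-arrays
+ conservative on this core: prefetches are emitted only for strides
+ that are known at compile time and at least 2048 bytes, and
+ default_opt_level 3 means the pass is enabled by default only from
+ -O3 upwards. */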
+
+static const struct tune_params qdf24xx_tunings =
+{
+ &qdf24xx_extra_costs,
+ &qdf24xx_addrcost_table,
+ &qdf24xx_regmove_cost,
+ &qdf24xx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
+ "16", /* function_align. */
+ "8", /* jump_align. */
+ "16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
+ &qdf24xx_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_QDF24XX. */
diff --git a/gcc/config/aarch64/tuning_models/saphira.h b/gcc/config/aarch64/tuning_models/saphira.h
new file mode 100644
index 0000000000000000000000000000000000000000..e584d316bb7c3c2d232cf7623a92100ad261f07d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/saphira.h
@@ -0,0 +1,63 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_SAPHIRA
+#define GCC_AARCH64_H_SAPHIRA
+
+#include "generic.h"
+
+/* Tuning structure for the Qualcomm Saphira core. Uses generic cost
+ tables with falkor-style issue, fusion and alignment parameters for
+ now. */
+static const struct tune_params saphira_tunings =
+{
+ &generic_extra_costs,
+ &generic_addrcost_table,
+ &generic_regmove_cost,
+ &generic_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+ | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
+ "16", /* function_align. */
+ "8", /* jump_align. */
+ "16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_SAPHIRA. */
diff --git a/gcc/config/aarch64/tuning_models/thunderx.h b/gcc/config/aarch64/tuning_models/thunderx.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd4b9d539fc5cf2bd20d84e91d6b72fa7237f99f
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx.h
@@ -0,0 +1,117 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_THUNDERX
+#define GCC_AARCH64_H_THUNDERX
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost thunderx_regmove_cost =
+{
+ 2, /* GP2GP */
+ 2, /* GP2FP */
+ 6, /* FP2GP */
+ 4 /* FP2FP */
+};
+
+static const advsimd_vec_cost thunderx_advsimd_vector_cost =
+{
+ 4, /* int_stmt_cost */
+ 1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 4, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 2, /* vec_to_scalar_cost */
+ 2, /* scalar_to_vec_cost */
+ 3, /* align_load_cost */
+ 5, /* unalign_load_cost */
+ 5, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* ThunderX costs for vector insn classes. */
+static const struct cpu_vector_cost thunderx_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 3, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 3, /* cond_taken_branch_cost */
+ 3, /* cond_not_taken_branch_cost */
+ &thunderx_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+static const cpu_prefetch_tune thunderx_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 128, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params thunderx_tunings =
+{
+ &thunderx_extra_costs,
+ &generic_addrcost_table,
+ &thunderx_regmove_cost,
+ &thunderx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 6, /* load_int. */
+ 6, /* store_int. */
+ 6, /* load_fp. */
+ 6, /* store_fp. */
+ 6, /* load_pred. */
+ 6 /* store_pred. */
+ }, /* memmov_cost. */
+ 2, /* issue_rate */
+ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
+ "8", /* function_align. */
+ "8", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ &thunderx_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX. */
diff --git a/gcc/config/aarch64/tuning_models/thunderx2t99.h b/gcc/config/aarch64/tuning_models/thunderx2t99.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a376e0bab37b0b5bc1ea23de0e96a9245846fd7
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx2t99.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_THUNDERX2T99
+#define GCC_AARCH64_H_THUNDERX2T99
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 1, /* si */
+ 1, /* di */
+ 2, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 2, /* register_offset */
+ 3, /* register_sextend */
+ 3, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of int<->fp moves for spilling. */
+ 5, /* GP2FP */
+ 6, /* FP2GP */
+ 3, /* FP2FP */
+};
+
+static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
+{
+ 4, /* int_stmt_cost */
+ 5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 10, /* permute_cost */
+ 6, /* reduc_i8_cost */
+ 6, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 6, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 6, /* reduc_f64_cost */
+ 6, /* store_elt_extra_cost */
+ 6, /* vec_to_scalar_cost */
+ 5, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+/* ThunderX2 T99 ("Vulcan") costs for vector insn classes. */
+static const struct cpu_vector_cost thunderx2t99_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 6, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 2, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &thunderx2t99_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 256, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params thunderx2t99_tunings =
+{
+ &thunderx2t99_extra_costs,
+ &thunderx2t99_addrcost_table,
+ &thunderx2t99_regmove_cost,
+ &thunderx2t99_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate. */
+ (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
+ "16", /* function_align. */
+ "8", /* jump_align. */
+ "16", /* loop_align. */
+ 3, /* int_reassoc_width. */
+ 2, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &thunderx2t99_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX2T99. */
diff --git a/gcc/config/aarch64/tuning_models/thunderx3t110.h b/gcc/config/aarch64/tuning_models/thunderx3t110.h
new file mode 100644
index 0000000000000000000000000000000000000000..65203b4af132e12e4994013fbab228bd3873b756
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx3t110.h
@@ -0,0 +1,136 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_THUNDERX3T110
+#define GCC_AARCH64_H_THUNDERX3T110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 1, /* si */
+ 1, /* di */
+ 2, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 2, /* register_offset */
+ 3, /* register_sextend */
+ 3, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of int<->fp moves for spilling. */
+ 4, /* GP2FP */
+ 5, /* FP2GP */
+ 4 /* FP2FP */
+};
+
+static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
+{
+ 5, /* int_stmt_cost */
+ 5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 10, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 5, /* reduc_i16_cost */
+ 5, /* reduc_i32_cost */
+ 5, /* reduc_i64_cost */
+ 5, /* reduc_f16_cost */
+ 5, /* reduc_f32_cost */
+ 5, /* reduc_f64_cost */
+ 5, /* store_elt_extra_cost */
+ 5, /* vec_to_scalar_cost */
+ 5, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ 4, /* unalign_store_cost */
+ 4 /* store_cost */
+};
+
+static const struct cpu_vector_cost thunderx3t110_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 5, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 2, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &thunderx3t110_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 256, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params thunderx3t110_tunings =
+{
+ &thunderx3t110_extra_costs,
+ &thunderx3t110_addrcost_table,
+ &thunderx3t110_regmove_cost,
+ &thunderx3t110_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 6, /* issue_rate. */
+ (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
+ "16", /* function_align. */
+ "8", /* jump_align. */
+ "16", /* loop_align. */
+ 3, /* int_reassoc_width. */
+ 2, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &thunderx3t110_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX3T110. */
diff --git a/gcc/config/aarch64/tuning_models/thunderxt88.h b/gcc/config/aarch64/tuning_models/thunderxt88.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcc74d31484ee6b99d37920dbfe7b1d59377d074
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderxt88.h
@@ -0,0 +1,72 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_THUNDERXT88
+#define GCC_AARCH64_H_THUNDERXT88
+
+#include "generic.h"
+#include "thunderx.h"
+
+static const cpu_prefetch_tune thunderxt88_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 128, /* l1_cache_line_size */
+ 16*1024, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ 3 /* default_opt_level */
+};
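+
+/* The cache sizes in these tables are in kilobytes and the line size
+ in bytes, so 16*1024 above describes the 16 MiB of shared L2 on
+ ThunderX T88. */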
+
+static const struct tune_params thunderxt88_tunings =
+{
+ &thunderx_extra_costs,
+ &generic_addrcost_table,
+ &thunderx_regmove_cost,
+ &thunderx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 6, /* load_int. */
+ 6, /* store_int. */
+ 6, /* load_fp. */
+ 6, /* store_fp. */
+ 6, /* load_pred. */
+ 6 /* store_pred. */
+ }, /* memmov_cost. */
+ 2, /* issue_rate */
+ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
+ "8", /* function_align. */
+ "8", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &thunderxt88_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERXT88. */
diff --git a/gcc/config/aarch64/tuning_models/tsv110.h b/gcc/config/aarch64/tuning_models/tsv110.h
new file mode 100644
index 0000000000000000000000000000000000000000..42aeafce652fff34e3277194993dd4aa1f0383a1
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/tsv110.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_TSV110
+#define GCC_AARCH64_H_TSV110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table tsv110_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 0, /* post_modify_ld3_st3 */
+ 0, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 1, /* register_sextend */
+ 1, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost tsv110_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 2, /* GP2FP */
+ 3, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost tsv110_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
+ 3, /* store_elt_extra_cost */
+ 3, /* vec_to_scalar_cost */
+ 2, /* scalar_to_vec_cost */
+ 5, /* align_load_cost */
+ 5, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const struct cpu_vector_cost tsv110_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 5, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &tsv110_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+static const cpu_prefetch_tune tsv110_prefetch_tune =
+{
+ 0, /* num_slots */
+ 64, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 512, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params tsv110_tunings =
+{
+ &tsv110_extra_costs,
+ &tsv110_addrcost_table,
+ &tsv110_regmove_cost,
+ &tsv110_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 4, /* store_int. */
+ 4, /* load_fp. */
+ 4, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
+ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
+ "16", /* function_align. */
+ "4", /* jump_align. */
+ "8", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &tsv110_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_TSV110. */
diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
new file mode 100644
index 0000000000000000000000000000000000000000..53a3eb0ddeb80a9735cc988e242a70e87dc90655
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/xgene1.h
@@ -0,0 +1,145 @@
+/* Tuning model description for AArch64 architecture.
+ Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_XGENE1
+#define GCC_AARCH64_H_XGENE1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table xgene1_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 1, /* pre_modify */
+ 1, /* post_modify */
+ 1, /* post_modify_ld3_st3 */
+ 1, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 1, /* register_sextend */
+ 1, /* register_zextend */
+ 0, /* imm_offset */
+};
+
+static const struct cpu_regmove_cost xgene1_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 8, /* GP2FP */
+ 8, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost xgene1_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 4, /* reduc_i32_cost */
+ 4, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 4, /* store_elt_extra_cost */
+ 4, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 10, /* align_load_cost */
+ 10, /* unalign_load_cost */
+ 2, /* unalign_store_cost */
+ 2 /* store_cost */
+};
+
+/* X-Gene 1 costs for vector insn classes. */
+static const struct cpu_vector_cost xgene1_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 5, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 2, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &xgene1_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ nullptr /* issue_info */
+};
+
+/* Approximation modes for X-Gene 1. */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+ AARCH64_APPROX_NONE, /* division */
+ AARCH64_APPROX_NONE, /* sqrt */
+ AARCH64_APPROX_ALL /* recip_sqrt */
+};
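+
+/* AARCH64_APPROX_ALL for recip_sqrt allows the FRSQRTE/FRSQRTS
+ Newton-Raphson sequence to replace 1.0/sqrt(x) in all modes when the
+ fast-math style flags (or -mlow-precision-recip-sqrt) permit it;
+ division and square root keep the exact instructions. */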
+
+static const cpu_prefetch_tune xgene1_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 256, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static const struct tune_params xgene1_tunings =
+{
+ &xgene1_extra_costs,
+ &xgene1_addrcost_table,
+ &xgene1_regmove_cost,
+ &xgene1_vector_cost,
+ &generic_branch_cost,
+ &xgene1_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 6, /* load_int. */
+ 6, /* store_int. */
+ 6, /* load_fp. */
+ 6, /* store_fp. */
+ 6, /* load_pred. */
+ 6 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ AARCH64_FUSE_NOTHING, /* fusible_ops */
+ "16", /* function_align. */
+ "16", /* jump_align. */
+ "16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 17, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
+ &xgene1_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_XGENE1. */