From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1923) id 094EA3857BA4; Mon, 14 Nov 2022 13:52:32 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 094EA3857BA4 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1668433952; bh=WcGj9r38uUeVl4MWhsbuyGa9Pj+pZgPbH8kKRMNdkKA=; h=From:To:Subject:Date:From; b=V9IcKjnM1ZfB/gfZk/h5N0QNXe8KsnWS+eMaSR0dP6XttHi8kRccdJcysriLFbl3R XW0qbfUJdfcJmt+qsXXdPbw96Vi4T4PvIqpI6DMzD8cAVDdY4IRhpB4Kwnb9oLonOq 47P7UW4fK7J6vbhrsznweXCVKnBg5WKdbXMLTyPU= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Philipp Tomsich To: gcc-cvs@gcc.gnu.org Subject: [gcc r13-4018] aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU X-Act-Checkin: gcc X-Git-Author: Philipp Tomsich X-Git-Refname: refs/heads/master X-Git-Oldrev: 5ba25973e2f403ee48af2ba579af5017b2f650fb X-Git-Newrev: 590a06afbf0e96813b5879742f38f3665512c854 Message-Id: <20221114135232.094EA3857BA4@sourceware.org> Date: Mon, 14 Nov 2022 13:52:32 +0000 (GMT) List-Id: https://gcc.gnu.org/g:590a06afbf0e96813b5879742f38f3665512c854 commit r13-4018-g590a06afbf0e96813b5879742f38f3665512c854 Author: Philipp Tomsich Date: Mon Nov 7 14:22:21 2022 +0100 aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU This patch adds support for Ampere-1A CPU: - recognize the name of the core and provide detection for -mcpu=native, - updated extra_costs, - adds a new fusion pair for (A+B+1 and A-B-1). Ampere-1A and Ampere-1 have more timing difference than the extra costs indicate, but these don't propagate through to the headline items in our extra costs (e.g. the change in latency for scalar sqrt doesn't have a corresponding table entry). gcc/ChangeLog: * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a. * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs. * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR): Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two registers and then +1/-1). * config/aarch64/aarch64-tune.md: Regenerate. * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement idiom-matcher for the new fusion pair. * doc/invoke.texi: Add ampere1a. Diff: --- gcc/config/aarch64/aarch64-cores.def | 1 + gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++++++++++++ gcc/config/aarch64/aarch64-fusion-pairs.def | 1 + gcc/config/aarch64/aarch64-tune.md | 2 +- gcc/config/aarch64/aarch64.cc | 64 +++++++++++++++++ gcc/doc/invoke.texi | 2 +- 6 files changed, 175 insertions(+), 2 deletions(-) diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index d2671778928..aead587cec1 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu /* Ampere Computing ('\xC0') cores. */ AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1) /* Do not swap around "emag" and "xgene1", this order is required to handle variant correctly. */ AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index 760d7b30368..48522606fbe 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs = } }; +const struct cpu_cost_table ampere1a_extra_costs = +{ + /* ALU */ + { + 0, /* arith. */ + 0, /* logical. */ + 0, /* shift. */ + COSTS_N_INSNS (1), /* shift_reg. */ + 0, /* arith_shift. */ + COSTS_N_INSNS (1), /* arith_shift_reg. */ + 0, /* log_shift. */ + COSTS_N_INSNS (1), /* log_shift_reg. */ + 0, /* extend. */ + COSTS_N_INSNS (1), /* extend_arith. */ + 0, /* bfi. */ + 0, /* bfx. */ + 0, /* clz. */ + 0, /* rev. */ + 0, /* non_exec. */ + true /* non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (3), /* simple. */ + COSTS_N_INSNS (3), /* flag_setting. */ + COSTS_N_INSNS (3), /* extend. */ + COSTS_N_INSNS (4), /* add. */ + COSTS_N_INSNS (4), /* extend_add. */ + COSTS_N_INSNS (19) /* idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (3), /* simple. */ + 0, /* flag_setting (N/A). */ + COSTS_N_INSNS (3), /* extend. */ + COSTS_N_INSNS (4), /* add. */ + COSTS_N_INSNS (4), /* extend_add. */ + COSTS_N_INSNS (35) /* idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (4), /* load. */ + COSTS_N_INSNS (4), /* load_sign_extend. */ + 0, /* ldrd (n/a). */ + 0, /* ldm_1st. */ + 0, /* ldm_regs_per_insn_1st. */ + 0, /* ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (5), /* loadf. */ + COSTS_N_INSNS (5), /* loadd. */ + COSTS_N_INSNS (5), /* load_unaligned. */ + 0, /* store. */ + 0, /* strd. */ + 0, /* stm_1st. */ + 0, /* stm_regs_per_insn_1st. */ + 0, /* stm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (2), /* storef. */ + COSTS_N_INSNS (2), /* stored. */ + COSTS_N_INSNS (2), /* store_unaligned. */ + COSTS_N_INSNS (3), /* loadv. */ + COSTS_N_INSNS (3) /* storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (25), /* div. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (4), /* mult_addsub. */ + COSTS_N_INSNS (4), /* fma. */ + COSTS_N_INSNS (4), /* addsub. */ + COSTS_N_INSNS (2), /* fpconst. */ + COSTS_N_INSNS (4), /* neg. */ + COSTS_N_INSNS (4), /* compare. */ + COSTS_N_INSNS (4), /* widen. */ + COSTS_N_INSNS (4), /* narrow. */ + COSTS_N_INSNS (4), /* toint. */ + COSTS_N_INSNS (4), /* fromint. */ + COSTS_N_INSNS (4) /* roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (34), /* div. */ + COSTS_N_INSNS (5), /* mult. */ + COSTS_N_INSNS (5), /* mult_addsub. */ + COSTS_N_INSNS (5), /* fma. */ + COSTS_N_INSNS (5), /* addsub. */ + COSTS_N_INSNS (2), /* fpconst. */ + COSTS_N_INSNS (5), /* neg. */ + COSTS_N_INSNS (5), /* compare. */ + COSTS_N_INSNS (5), /* widen. */ + COSTS_N_INSNS (5), /* narrow. */ + COSTS_N_INSNS (6), /* toint. */ + COSTS_N_INSNS (6), /* fromint. */ + COSTS_N_INSNS (5) /* roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (3), /* alu. */ + COSTS_N_INSNS (3), /* mult. */ + COSTS_N_INSNS (2), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ + } +}; + #endif diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def index c064fb9b85d..d91f8a2babd 100644 --- a/gcc/config/aarch64/aarch64-fusion-pairs.def +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH) AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC) AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH) AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ) +AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1) #undef AARCH64_FUSION_PAIR diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 22ec1be5a4c..b7d6fc8cc88 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index d1f979ebcf8..a7f7c3c0121 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings = &ere1_prefetch_tune }; +static const struct tune_params ampere1a_tunings = +{ + &ere1a_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &ere1_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ + 4, /* issue_rate */ + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | + AARCH64_FUSE_ADDSUB_2REG_CONST1), + /* fusible_ops */ + "32", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + &ere1_prefetch_tune +}; + static const advsimd_vec_cost neoversev1_advsimd_vector_cost = { 2, /* int_stmt_cost */ @@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + /* Fuse A+B+1 and A-B-1 */ + if (simple_sets_p + && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1)) + { + /* We're trying to match: + prev == (set (r0) (plus (r0) (r1))) + curr == (set (r0) (plus (r0) (const_int 1))) + or: + prev == (set (r0) (minus (r0) (r1))) + curr == (set (r0) (plus (r0) (const_int -1))) */ + + rtx prev_src = SET_SRC (prev_set); + rtx curr_src = SET_SRC (curr_set); + + int polarity = 1; + if (GET_CODE (prev_src) == MINUS) + polarity = -1; + + if (GET_CODE (curr_src) == PLUS + && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS) + && CONST_INT_P (XEXP (curr_src, 1)) + && INTVAL (XEXP (curr_src, 1)) == polarity + && REG_P (XEXP (curr_src, 0)) + && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0))) + return true; + } + return false; } diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 449df597298..2e4433d9a05 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -19995,7 +19995,7 @@ performance of the code. Permissible values for this option are: @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55}, @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2}, @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1}, -@samp{native}. +@samp{ampere1a}, and @samp{native}. The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},