diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 4c61e35..885ccff 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -102,6 +102,7 @@ extern int arm_early_load_addr_dep (rtx, rtx); extern int arm_no_early_alu_shift_dep (rtx, rtx); extern int arm_no_early_alu_shift_value_dep (rtx, rtx); extern int arm_no_early_mul_dep (rtx, rtx); +extern int arm_mac_accumulator_is_result (rtx, rtx); extern int arm_mac_accumulator_is_mul_result (rtx, rtx); extern int tls_mentioned_p (rtx); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 13d745f..39f1eb3 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -24610,6 +24610,62 @@ arm_cxx_guard_type (void) return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node; } +/* Return non-zero iff the consumer (a multiply-accumulate or a + multiply-subtract instruction) has an accumulator dependency on the + result of the producer and no other dependency on that result. It + does not check if the producer is a multiply-accumulate instruction. */ +int +arm_mac_accumulator_is_result (rtx producer, rtx consumer) +{ + rtx result; + rtx op0, op1, acc; + + producer = PATTERN (producer); + consumer = PATTERN (consumer); + + if (GET_CODE (producer) == COND_EXEC) + producer = COND_EXEC_CODE (producer); + if (GET_CODE (consumer) == COND_EXEC) + consumer = COND_EXEC_CODE (consumer); + + if (GET_CODE (producer) != SET) + return 0; + + result = XEXP (producer, 0); + + if (GET_CODE (consumer) != SET) + return 0; + + /* Check that the consumer is of the form + (set (...) (plus (mult ...) (...))) + or + (set (...) (minus (...) (mult ...))). 
*/ + if (GET_CODE (XEXP (consumer, 1)) == PLUS) + { + if (GET_CODE (XEXP (XEXP (consumer, 1), 0)) != MULT) + return 0; + + op0 = XEXP (XEXP (XEXP (consumer, 1), 0), 0); + op1 = XEXP (XEXP (XEXP (consumer, 1), 0), 1); + acc = XEXP (XEXP (consumer, 1), 1); + } + else if (GET_CODE (XEXP (consumer, 1)) == MINUS) + { + if (GET_CODE (XEXP (XEXP (consumer, 1), 1)) != MULT) + return 0; + + op0 = XEXP (XEXP (XEXP (consumer, 1), 1), 0); + op1 = XEXP (XEXP (XEXP (consumer, 1), 1), 1); + acc = XEXP (XEXP (consumer, 1), 0); + } + else + return 0; + + return (reg_overlap_mentioned_p (result, acc) + && !reg_overlap_mentioned_p (result, op0) + && !reg_overlap_mentioned_p (result, op1)); +} + /* Return non-zero if the consumer (a multiply-accumulate instruction) has an accumulator dependency on the result of the producer (a multiplication instruction) and no other dependency on that result. */ diff --git a/gcc/config/arm/cortex-a7.md b/gcc/config/arm/cortex-a7.md index 930242d..2cef5fd 100644 --- a/gcc/config/arm/cortex-a7.md +++ b/gcc/config/arm/cortex-a7.md @@ -137,6 +137,12 @@ (eq_attr "neon_type" "none"))) "cortex_a7_both") +;; Forward the result of a multiply operation to the accumulator +;; of the following multiply and accumulate instruction. +(define_bypass 1 "cortex_a7_mul" + "cortex_a7_mul" + "arm_mac_accumulator_is_result") + ;; The latency depends on the operands, so we use an estimate here. (define_insn_reservation "cortex_a7_idiv" 5 (and (eq_attr "tune" "cortexa7") @@ -264,6 +271,10 @@ neon_fp_vmla_qqq_scalar")) "cortex_a7_both+cortex_a7_fpmul_pipe") +(define_bypass 4 "cortex_a7_fpmacs,cortex_a7_neon_mla" + "cortex_a7_fpmacs,cortex_a7_neon_mla" + "arm_mac_accumulator_is_result") + ;; Non-multiply instructions can issue between two cycles of a ;; double-precision multiply. 
@@ -285,6 +296,10 @@ (eq_attr "neon_type" "none"))) "cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*4") +(define_bypass 7 "cortex_a7_fpmacd" + "cortex_a7_fpmacd,cortex_a7_fpfmad" + "arm_mac_accumulator_is_result") + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Floating-point divide/square root instructions. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;