* [PATCH][AArch64] Improve Cortex-A53 integer scheduler
@ 2016-07-05 15:00 Wilco Dijkstra
2016-07-06 8:41 ` Richard Earnshaw (lists)
0 siblings, 1 reply; 2+ messages in thread
From: Wilco Dijkstra @ 2016-07-05 15:00 UTC (permalink / raw)
To: GCC Patches; +Cc: nd, James Greenhalgh
This patch improves the accuracy of the Cortex-A53 integer scheduler,
resulting in performance gains across a wide range of benchmarks.
OK for commit?
ChangeLog:
2016-07-05 Wilco Dijkstra <wdijkstr@arm.com>
* config/arm/cortex-a53.md: Use final_presence_set for in-order.
(cortex_a53_shift): Add mov_shift, remove shift_reg.
(cortex_a53_shift_reg): Add new reservation for register shifts.
(cortex_a53_alu): Remove bfm.
(cortex_a53_alu_shift): Add bfm, remove mov_shift.
(cortex_a53_alu_extr): Add new reservation for EXTR.
(bypasses): Improve bypass modelling.
---
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index fc60bc26c7caf7e94064d7f292b877b12f333fca..70c0f4daabe0ccb8e32808f1af51f5460e087a18 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -30,6 +30,7 @@
(define_cpu_unit "cortex_a53_slot0" "cortex_a53")
(define_cpu_unit "cortex_a53_slot1" "cortex_a53")
+(final_presence_set "cortex_a53_slot1" "cortex_a53_slot0")
(define_reservation "cortex_a53_slot_any"
"cortex_a53_slot0\
@@ -71,41 +72,43 @@
(define_insn_reservation "cortex_a53_shift" 2
(and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "adr,shift_imm,shift_reg,mov_imm,mvn_imm"))
+ (eq_attr "type" "adr,shift_imm,mov_imm,mvn_imm,mov_shift"))
"cortex_a53_slot_any")
-(define_insn_reservation "cortex_a53_alu_rotate_imm" 2
+(define_insn_reservation "cortex_a53_shift_reg" 2
(and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "rotate_imm"))
- "(cortex_a53_slot1)
- | (cortex_a53_single_issue)")
+ (eq_attr "type" "shift_reg,mov_shift_reg"))
+ "cortex_a53_slot_any+cortex_a53_hazard")
(define_insn_reservation "cortex_a53_alu" 3
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
alu_sreg,alus_sreg,logic_reg,logics_reg,
adc_imm,adcs_imm,adc_reg,adcs_reg,
- bfm,csel,clz,rbit,rev,alu_dsp_reg,
- mov_reg,mvn_reg,
- mrs,multiple,no_insn"))
+ csel,clz,rbit,rev,alu_dsp_reg,
+ mov_reg,mvn_reg,mrs,multiple,no_insn"))
"cortex_a53_slot_any")
(define_insn_reservation "cortex_a53_alu_shift" 3
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "alu_shift_imm,alus_shift_imm,
crc,logic_shift_imm,logics_shift_imm,
- alu_ext,alus_ext,
- extend,mov_shift,mvn_shift"))
+ alu_ext,alus_ext,bfm,extend,mvn_shift"))
"cortex_a53_slot_any")
(define_insn_reservation "cortex_a53_alu_shift_reg" 3
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "alu_shift_reg,alus_shift_reg,
logic_shift_reg,logics_shift_reg,
- mov_shift_reg,mvn_shift_reg"))
+ mvn_shift_reg"))
"cortex_a53_slot_any+cortex_a53_hazard")
-(define_insn_reservation "cortex_a53_mul" 3
+(define_insn_reservation "cortex_a53_alu_extr" 3
+ (and (eq_attr "tune" "cortexa53")
+ (eq_attr "type" "rotate_imm"))
+ "cortex_a53_slot1|cortex_a53_single_issue")
+
+(define_insn_reservation "cortex_a53_mul" 4
(and (eq_attr "tune" "cortexa53")
(ior (eq_attr "mul32" "yes")
(eq_attr "mul64" "yes")))
@@ -189,49 +192,43 @@
(define_insn_reservation "cortex_a53_branch" 0
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "branch,call"))
- "cortex_a53_slot_any,cortex_a53_branch")
+ "cortex_a53_slot_any+cortex_a53_branch")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General-purpose register bypasses
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; Model bypasses for unshifted operands to ALU instructions.
+;; Model bypasses for ALU to ALU instructions.
-(define_bypass 1 "cortex_a53_shift"
- "cortex_a53_shift")
+(define_bypass 0 "cortex_a53_shift*"
+ "cortex_a53_alu")
-(define_bypass 1 "cortex_a53_alu,
- cortex_a53_alu_shift*,
- cortex_a53_alu_rotate_imm,
- cortex_a53_shift"
+(define_bypass 1 "cortex_a53_shift*"
+ "cortex_a53_shift*,cortex_a53_alu_*")
+
+(define_bypass 1 "cortex_a53_alu*"
"cortex_a53_alu")
-(define_bypass 2 "cortex_a53_alu,
- cortex_a53_alu_shift*"
+(define_bypass 1 "cortex_a53_alu*"
"cortex_a53_alu_shift*"
"aarch_forward_to_shift_is_not_shifted_reg")
-;; In our model, we allow any general-purpose register operation to
-;; bypass to the accumulator operand of an integer MADD-like operation.
+(define_bypass 2 "cortex_a53_alu*"
+ "cortex_a53_alu_*,cortex_a53_shift*")
-(define_bypass 1 "cortex_a53_alu*,
- cortex_a53_load*,
- cortex_a53_mul"
+;; Model a bypass from MUL/MLA to MLA instructions.
+
+(define_bypass 1 "cortex_a53_mul"
"cortex_a53_mul"
"aarch_accumulator_forwarding")
-;; Model a bypass from MLA/MUL to many ALU instructions.
+;; Model a bypass from MUL/MLA to ALU instructions.
(define_bypass 2 "cortex_a53_mul"
- "cortex_a53_alu,
- cortex_a53_alu_shift*")
-
-;; We get neater schedules by allowing an MLA/MUL to feed an
-;; early load address dependency to a load.
+ "cortex_a53_alu")
-(define_bypass 2 "cortex_a53_mul"
- "cortex_a53_load*"
- "arm_early_load_addr_dep")
+(define_bypass 3 "cortex_a53_mul"
+ "cortex_a53_alu_*,cortex_a53_shift*")
;; Model bypasses for loads which are to be consumed by the ALU.
@@ -239,47 +236,37 @@
"cortex_a53_alu")
(define_bypass 3 "cortex_a53_load1"
- "cortex_a53_alu_shift*")
+ "cortex_a53_alu_*,cortex_a53_shift*")
+
+(define_bypass 3 "cortex_a53_load2"
+ "cortex_a53_alu")
;; Model a bypass for ALU instructions feeding stores.
-(define_bypass 1 "cortex_a53_alu*"
- "cortex_a53_store1,
- cortex_a53_store2,
- cortex_a53_store3plus"
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
+ "cortex_a53_store*"
"arm_no_early_store_addr_dep")
;; Model a bypass for load and multiply instructions feeding stores.
-(define_bypass 2 "cortex_a53_mul,
- cortex_a53_load1,
- cortex_a53_load2,
- cortex_a53_load3plus"
- "cortex_a53_store1,
- cortex_a53_store2,
- cortex_a53_store3plus"
+(define_bypass 1 "cortex_a53_mul,
+ cortex_a53_load*"
+ "cortex_a53_store*"
"arm_no_early_store_addr_dep")
;; Model a GP->FP register move as similar to stores.
-(define_bypass 1 "cortex_a53_alu*"
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
"cortex_a53_r2f")
-(define_bypass 2 "cortex_a53_mul,
- cortex_a53_load1,
- cortex_a53_load2,
- cortex_a53_load3plus"
+(define_bypass 1 "cortex_a53_mul,
+ cortex_a53_load*"
"cortex_a53_r2f")
-;; Shifts feeding Load/Store addresses may not be ready in time.
+;; Model flag forwarding to branches.
-(define_bypass 3 "cortex_a53_shift"
- "cortex_a53_load*"
- "arm_early_load_addr_dep")
-
-(define_bypass 3 "cortex_a53_shift"
- "cortex_a53_store*"
- "arm_early_store_addr_dep")
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
+ "cortex_a53_branch")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD.
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH][AArch64] Improve Cortex-A53 integer scheduler
2016-07-05 15:00 [PATCH][AArch64] Improve Cortex-A53 integer scheduler Wilco Dijkstra
@ 2016-07-06 8:41 ` Richard Earnshaw (lists)
0 siblings, 0 replies; 2+ messages in thread
From: Richard Earnshaw (lists) @ 2016-07-06 8:41 UTC (permalink / raw)
To: Wilco Dijkstra, GCC Patches; +Cc: nd, James Greenhalgh
On 05/07/16 16:00, Wilco Dijkstra wrote:
> This patch improves the accuracy of the Cortex-A53 integer scheduler,
> resulting in performance gains across a wide range of benchmarks.
>
> OK for commit?
>
OK.
R.
> ChangeLog:
> 2016-07-05 Wilco Dijkstra <wdijkstr@arm.com>
>
> * config/arm/cortex-a53.md: Use final_presence_set for in-order.
> (cortex_a53_shift): Add mov_shift, remove shift_reg.
> (cortex_a53_shift_reg): Add new reservation for register shifts.
> (cortex_a53_alu): Remove bfm.
> (cortex_a53_alu_shift): Add bfm, remove mov_shift.
> (cortex_a53_alu_extr): Add new reservation for EXTR.
> (bypasses): Improve bypass modelling.
>
> ---
> diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
> index fc60bc26c7caf7e94064d7f292b877b12f333fca..70c0f4daabe0ccb8e32808f1af51f5460e087a18 100644
> --- a/gcc/config/arm/cortex-a53.md
> +++ b/gcc/config/arm/cortex-a53.md
> @@ -30,6 +30,7 @@
>
> (define_cpu_unit "cortex_a53_slot0" "cortex_a53")
> (define_cpu_unit "cortex_a53_slot1" "cortex_a53")
> +(final_presence_set "cortex_a53_slot1" "cortex_a53_slot0")
>
> (define_reservation "cortex_a53_slot_any"
> "cortex_a53_slot0\
> @@ -71,41 +72,43 @@
>
> (define_insn_reservation "cortex_a53_shift" 2
> (and (eq_attr "tune" "cortexa53")
> - (eq_attr "type" "adr,shift_imm,shift_reg,mov_imm,mvn_imm"))
> + (eq_attr "type" "adr,shift_imm,mov_imm,mvn_imm,mov_shift"))
> "cortex_a53_slot_any")
>
> -(define_insn_reservation "cortex_a53_alu_rotate_imm" 2
> +(define_insn_reservation "cortex_a53_shift_reg" 2
> (and (eq_attr "tune" "cortexa53")
> - (eq_attr "type" "rotate_imm"))
> - "(cortex_a53_slot1)
> - | (cortex_a53_single_issue)")
> + (eq_attr "type" "shift_reg,mov_shift_reg"))
> + "cortex_a53_slot_any+cortex_a53_hazard")
>
> (define_insn_reservation "cortex_a53_alu" 3
> (and (eq_attr "tune" "cortexa53")
> (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
> alu_sreg,alus_sreg,logic_reg,logics_reg,
> adc_imm,adcs_imm,adc_reg,adcs_reg,
> - bfm,csel,clz,rbit,rev,alu_dsp_reg,
> - mov_reg,mvn_reg,
> - mrs,multiple,no_insn"))
> + csel,clz,rbit,rev,alu_dsp_reg,
> + mov_reg,mvn_reg,mrs,multiple,no_insn"))
> "cortex_a53_slot_any")
>
> (define_insn_reservation "cortex_a53_alu_shift" 3
> (and (eq_attr "tune" "cortexa53")
> (eq_attr "type" "alu_shift_imm,alus_shift_imm,
> crc,logic_shift_imm,logics_shift_imm,
> - alu_ext,alus_ext,
> - extend,mov_shift,mvn_shift"))
> + alu_ext,alus_ext,bfm,extend,mvn_shift"))
> "cortex_a53_slot_any")
>
> (define_insn_reservation "cortex_a53_alu_shift_reg" 3
> (and (eq_attr "tune" "cortexa53")
> (eq_attr "type" "alu_shift_reg,alus_shift_reg,
> logic_shift_reg,logics_shift_reg,
> - mov_shift_reg,mvn_shift_reg"))
> + mvn_shift_reg"))
> "cortex_a53_slot_any+cortex_a53_hazard")
>
> -(define_insn_reservation "cortex_a53_mul" 3
> +(define_insn_reservation "cortex_a53_alu_extr" 3
> + (and (eq_attr "tune" "cortexa53")
> + (eq_attr "type" "rotate_imm"))
> + "cortex_a53_slot1|cortex_a53_single_issue")
> +
> +(define_insn_reservation "cortex_a53_mul" 4
> (and (eq_attr "tune" "cortexa53")
> (ior (eq_attr "mul32" "yes")
> (eq_attr "mul64" "yes")))
> @@ -189,49 +192,43 @@
> (define_insn_reservation "cortex_a53_branch" 0
> (and (eq_attr "tune" "cortexa53")
> (eq_attr "type" "branch,call"))
> - "cortex_a53_slot_any,cortex_a53_branch")
> + "cortex_a53_slot_any+cortex_a53_branch")
>
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> ;; General-purpose register bypasses
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
> -;; Model bypasses for unshifted operands to ALU instructions.
> +;; Model bypasses for ALU to ALU instructions.
>
> -(define_bypass 1 "cortex_a53_shift"
> - "cortex_a53_shift")
> +(define_bypass 0 "cortex_a53_shift*"
> + "cortex_a53_alu")
>
> -(define_bypass 1 "cortex_a53_alu,
> - cortex_a53_alu_shift*,
> - cortex_a53_alu_rotate_imm,
> - cortex_a53_shift"
> +(define_bypass 1 "cortex_a53_shift*"
> + "cortex_a53_shift*,cortex_a53_alu_*")
> +
> +(define_bypass 1 "cortex_a53_alu*"
> "cortex_a53_alu")
>
> -(define_bypass 2 "cortex_a53_alu,
> - cortex_a53_alu_shift*"
> +(define_bypass 1 "cortex_a53_alu*"
> "cortex_a53_alu_shift*"
> "aarch_forward_to_shift_is_not_shifted_reg")
>
> -;; In our model, we allow any general-purpose register operation to
> -;; bypass to the accumulator operand of an integer MADD-like operation.
> +(define_bypass 2 "cortex_a53_alu*"
> + "cortex_a53_alu_*,cortex_a53_shift*")
>
> -(define_bypass 1 "cortex_a53_alu*,
> - cortex_a53_load*,
> - cortex_a53_mul"
> +;; Model a bypass from MUL/MLA to MLA instructions.
> +
> +(define_bypass 1 "cortex_a53_mul"
> "cortex_a53_mul"
> "aarch_accumulator_forwarding")
>
> -;; Model a bypass from MLA/MUL to many ALU instructions.
> +;; Model a bypass from MUL/MLA to ALU instructions.
>
> (define_bypass 2 "cortex_a53_mul"
> - "cortex_a53_alu,
> - cortex_a53_alu_shift*")
> -
> -;; We get neater schedules by allowing an MLA/MUL to feed an
> -;; early load address dependency to a load.
> + "cortex_a53_alu")
>
> -(define_bypass 2 "cortex_a53_mul"
> - "cortex_a53_load*"
> - "arm_early_load_addr_dep")
> +(define_bypass 3 "cortex_a53_mul"
> + "cortex_a53_alu_*,cortex_a53_shift*")
>
> ;; Model bypasses for loads which are to be consumed by the ALU.
>
> @@ -239,47 +236,37 @@
> "cortex_a53_alu")
>
> (define_bypass 3 "cortex_a53_load1"
> - "cortex_a53_alu_shift*")
> + "cortex_a53_alu_*,cortex_a53_shift*")
> +
> +(define_bypass 3 "cortex_a53_load2"
> + "cortex_a53_alu")
>
> ;; Model a bypass for ALU instructions feeding stores.
>
> -(define_bypass 1 "cortex_a53_alu*"
> - "cortex_a53_store1,
> - cortex_a53_store2,
> - cortex_a53_store3plus"
> +(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
> + "cortex_a53_store*"
> "arm_no_early_store_addr_dep")
>
> ;; Model a bypass for load and multiply instructions feeding stores.
>
> -(define_bypass 2 "cortex_a53_mul,
> - cortex_a53_load1,
> - cortex_a53_load2,
> - cortex_a53_load3plus"
> - "cortex_a53_store1,
> - cortex_a53_store2,
> - cortex_a53_store3plus"
> +(define_bypass 1 "cortex_a53_mul,
> + cortex_a53_load*"
> + "cortex_a53_store*"
> "arm_no_early_store_addr_dep")
>
> ;; Model a GP->FP register move as similar to stores.
>
> -(define_bypass 1 "cortex_a53_alu*"
> +(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
> "cortex_a53_r2f")
>
> -(define_bypass 2 "cortex_a53_mul,
> - cortex_a53_load1,
> - cortex_a53_load2,
> - cortex_a53_load3plus"
> +(define_bypass 1 "cortex_a53_mul,
> + cortex_a53_load*"
> "cortex_a53_r2f")
>
> -;; Shifts feeding Load/Store addresses may not be ready in time.
> +;; Model flag forwarding to branches.
>
> -(define_bypass 3 "cortex_a53_shift"
> - "cortex_a53_load*"
> - "arm_early_load_addr_dep")
> -
> -(define_bypass 3 "cortex_a53_shift"
> - "cortex_a53_store*"
> - "arm_early_store_addr_dep")
> +(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
> + "cortex_a53_branch")
>
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> ;; Floating-point/Advanced SIMD.
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2016-07-06 8:41 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-05 15:00 [PATCH][AArch64] Improve Cortex-A53 integer scheduler Wilco Dijkstra
2016-07-06 8:41 ` Richard Earnshaw (lists)
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).