On Mon, 24 Oct 2022, Jan Hubička wrote: > > By the way, it appears pre-existing znver[123] models are also causing > > some kind > > of combinatorial blow-up, but before znver4 it was not a blocking issue: > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87832 > > > It is really easy to make DFA size to grow if there are possibly many > instructions in the pipeline (as every possible state of a modelled pipeline > needs to be a new state of the automaton). This is essentially > depth_of_pipeline * number_of_units with additional states to repesent > special instructions and this naturally keeps growing. > > We could try to break the FP automata into multiple ones, but there are > instructions that can go down any pipe which makes this hard > or we can try toreduce number of different reservation types (possibly by > breaking the automaton to znver1-3 and 4 or so). > With znver2 model I experimented with broken up version and common one and > ended up with smaller binary for combined one. Looking at znver1.md again, I think the problem is caused by incorrect modeling of division instructions: they have descriptions like (define_insn_reservation "znver1_idiv_DI" 41 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "DI") (eq_attr "memory" "none")))) "znver1-double,znver1-ieu2*41") which says that DImode idiv has latency 41 (which is correct) and that it occupies 2nd integer execution unit for 41 consecutive cycles, but that is not correct: 1) the division instruction is partially pipelined, and has throughput 1/14 2) for the most part it occupies a separate division unit, not the general arithmetic unit. (incidentally, I think the blowup is caused by interaction of such super-long 41-cycle paths with the rest of reservations) I think we should fix this by modeling the separate division unit properly, and fixing reservations to use the measured reciprocal throughput of those instructions (available from uops.info). The following patch does that for integer divisions and completely eliminates the integer part of the problem; the issue with floating-point divisions remains. Top 5 znver table sizes, before: 68692 r znver1_ieu_check 68692 r znver1_ieu_transitions 99792 r znver1_ieu_min_issue_delay 428108 r znver1_fp_min_issue_delay 856216 r znver1_fp_transitions After: 1454 r znver1_ieu_translate 1454 r znver1_translate 2304 r znver1_ieu_transitions 428108 r znver1_fp_min_issue_delay 856216 r znver1_fp_transitions Will you help getting this reviewed for trunk? diff --git a/gcc/config/i386/znver1.md b/gcc/config/i386/znver1.md index 9c25b4e27..39b59343d 100644 --- a/gcc/config/i386/znver1.md +++ b/gcc/config/i386/znver1.md @@ -24,7 +24,7 @@ ;; AMD znver1, znver2 and znver3 Scheduling ;; Modeling automatons for zen decoders, integer execution pipes, ;; AGU pipes and floating point execution units. -(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu") +(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv") ;; Decoders unit has 4 decoders and all of them can decode fast path ;; and vector type instructions. @@ -50,6 +50,7 @@ (define_cpu_unit "znver1-ieu1" "znver1_ieu") (define_cpu_unit "znver1-ieu2" "znver1_ieu") (define_cpu_unit "znver1-ieu3" "znver1_ieu") +(define_cpu_unit "znver1-idiv" "znver1_idiv") (define_reservation "znver1-ieu" "znver1-ieu0|znver1-ieu1|znver1-ieu2|znver1-ieu3") ;; 2 AGU pipes in znver1 and 3 AGU pipes in znver2 and znver3 @@ -176,28 +177,28 @@ (and (eq_attr "type" "idiv") (and (eq_attr "mode" "DI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*41") + "znver1-double,znver1-idiv*14") (define_insn_reservation "znver1_idiv_SI" 25 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "SI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*25") + "znver1-double,znver1-idiv*14") (define_insn_reservation "znver1_idiv_HI" 17 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "HI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*17") + "znver1-double,znver1-idiv*14") (define_insn_reservation "znver1_idiv_QI" 12 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "QI") (eq_attr "memory" "none")))) - "znver1-direct,znver1-ieu2*12") + "znver1-direct,znver1-idiv*13") ;; Mem operands (define_insn_reservation "znver1_idiv_mem_DI" 45 @@ -205,84 +206,84 @@ (and (eq_attr "type" "idiv") (and (eq_attr "mode" "DI") (eq_attr "memory" "none")))) - "znver1-double,znver1-load,znver1-ieu2*41") + "znver1-double,znver1-load,znver1-idiv*14") (define_insn_reservation "znver1_idiv_mem_SI" 29 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "SI") (eq_attr "memory" "none")))) - "znver1-double,znver1-load,znver1-ieu2*25") + "znver1-double,znver1-load,znver1-idiv*14") (define_insn_reservation "znver1_idiv_mem_HI" 21 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "HI") (eq_attr "memory" "none")))) - "znver1-double,znver1-load,znver1-ieu2*17") + "znver1-double,znver1-load,znver1-idiv*14") (define_insn_reservation "znver1_idiv_mem_QI" 16 (and (eq_attr "cpu" "znver1,znver2") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "QI") (eq_attr "memory" "none")))) - "znver1-direct,znver1-load,znver1-ieu2*12") + "znver1-direct,znver1-load,znver1-idiv*13") (define_insn_reservation "znver3_idiv_DI" 18 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "DI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*18") + "znver1-double,znver1-idiv*7") (define_insn_reservation "znver3_idiv_SI" 12 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "SI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*12") + "znver1-double,znver1-idiv*6") (define_insn_reservation "znver3_idiv_HI" 10 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "HI") (eq_attr "memory" "none")))) - "znver1-double,znver1-ieu2*10") + "znver1-double,znver1-idiv*4") (define_insn_reservation "znver3_idiv_QI" 9 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "QI") (eq_attr "memory" "none")))) - "znver1-direct,znver1-ieu2*9") + "znver1-direct,znver1-idiv*4") (define_insn_reservation "znver3_idiv_mem_DI" 22 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "DI") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-ieu2*22") + "znver1-double,znver1-load,znver1-idiv*7") (define_insn_reservation "znver3_idiv_mem_SI" 16 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "SI") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-ieu2*16") + "znver1-double,znver1-load,znver1-idiv*6") (define_insn_reservation "znver3_idiv_mem_HI" 14 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "HI") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-ieu2*10") + "znver1-double,znver1-load,znver1-idiv*4") (define_insn_reservation "znver3_idiv_mem_QI" 13 (and (eq_attr "cpu" "znver3") (and (eq_attr "type" "idiv") (and (eq_attr "mode" "QI") (eq_attr "memory" "load")))) - "znver1-direct,znver1-load,znver1-ieu2*9") + "znver1-direct,znver1-load,znver1-idiv*4") ;; STR ISHIFT which are micro coded. ;; Fix me: Latency need to be rechecked.