public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.
@ 2024-02-27  8:52 Jiawei
  2024-03-19  2:54 ` Jeff Law
  0 siblings, 1 reply; 11+ messages in thread
From: Jiawei @ 2024-02-27  8:52 UTC (permalink / raw)
  To: gcc-patches
  Cc: kito.cheng, palmer, jlaw, christoph.muellner, wuwei2016, shihua,
	shiyulong, chenyixuan, Chen Jiawei

From: Chen Jiawei <jiawei@iscas.ac.cn>

Co-Authored by: Lin Jiawei <jiawei.lin@epfl.ch>

This patch adds the XiangShan Nanhu CPU microarchitecture,
Nanhu is a 6-issue, superscalar, out-of-order processor.
More details see: https://xiangshan-doc.readthedocs.io/zh-cn/latest/arch

gcc/ChangeLog:

        * config/riscv/riscv-cores.def (RISCV_TUNE): New def.
        (RISCV_CORE): Ditto.
        * config/riscv/riscv-opts.h (enum
        riscv_microarchitecture_type): New option.
        * config/riscv/riscv.cc: New def.
        * config/riscv/riscv.md: New include.
        * config/riscv/xiangshan.md: New file.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/mcpu-xiangshan-nanhu.c: New test.

---
 gcc/config/riscv/riscv-cores.def              |   6 +
 gcc/config/riscv/riscv-opts.h                 |   1 +
 gcc/config/riscv/riscv.cc                     |  17 ++
 gcc/config/riscv/riscv.md                     |   3 +-
 gcc/config/riscv/xiangshan.md                 | 148 ++++++++++++++++++
 .../gcc.target/riscv/mcpu-xiangshan-nanhu.c   |  34 ++++
 6 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/riscv/xiangshan.md
 create mode 100644 gcc/testsuite/gcc.target/riscv/mcpu-xiangshan-nanhu.c

diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def
index 57928bccdc8..ab23bb7a856 100644
--- a/gcc/config/riscv/riscv-cores.def
+++ b/gcc/config/riscv/riscv-cores.def
@@ -40,6 +40,7 @@ RISCV_TUNE("sifive-7-series", sifive_7, sifive_7_tune_info)
 RISCV_TUNE("sifive-p400-series", sifive_p400, sifive_p400_tune_info)
 RISCV_TUNE("sifive-p600-series", sifive_p600, sifive_p600_tune_info)
 RISCV_TUNE("thead-c906", generic, thead_c906_tune_info)
+RISCV_TUNE("xiangshan-nanhu", xiangshan, xiangshan_nanhu_tune_info)
 RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info)
 RISCV_TUNE("size", generic, optimize_size_tune_info)
 
@@ -90,4 +91,9 @@ RISCV_CORE("thead-c906",      "rv64imafdc_xtheadba_xtheadbb_xtheadbs_xtheadcmo_"
 			      "xtheadcondmov_xtheadfmemidx_xtheadmac_"
 			      "xtheadmemidx_xtheadmempair_xtheadsync",
 			      "thead-c906")
+
+RISCV_CORE("xiangshan-nanhu",      "rv64imafdc_zba_zbb_zbc_zbs_"
+			      "zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_"
+			      "svinval_zicbom_zicboz",
+			      "xiangshan-nanhu")
 #undef RISCV_CORE
diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
index 4edddbadc37..31f9bffa9b6 100644
--- a/gcc/config/riscv/riscv-opts.h
+++ b/gcc/config/riscv/riscv-opts.h
@@ -57,6 +57,7 @@ enum riscv_microarchitecture_type {
   sifive_7,
   sifive_p400,
   sifive_p600,
+  xiangshan,
   generic_ooo
 };
 extern enum riscv_microarchitecture_type riscv_microarchitecture;
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..aa53e25ae03 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -498,6 +498,23 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   NULL,						/* vector cost */
 };
 
+/* Costs to use when optimizing for xiangshan nanhu.  */
+static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
+  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
+  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
+  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
+  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
+  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
+  6,						/* issue_rate */
+  3,						/* branch_cost */
+  3,						/* memory_cost */
+  3,						/* fmv_cost */
+  true,						/* slow_unaligned_access */
+  false,					/* use_divmod_expansion */
+  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
+  NULL,						/* vector cost */
+};
+
 /* Costs to use when optimizing for a generic ooo profile.  */
 static const struct riscv_tune_param generic_ooo_tune_info = {
   {COSTS_N_INSNS (2), COSTS_N_INSNS (2)},	/* fp_add */
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 1fec13092e2..8aafe19ab51 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -685,7 +685,7 @@
 ;; Microarchitectures we know how to tune for.
 ;; Keep this in sync with enum riscv_microarchitecture.
 (define_attr "tune"
-  "generic,sifive_7,sifive_p400,sifive_p600,generic_ooo"
+  "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo"
   (const (symbol_ref "((enum attr_tune) riscv_microarchitecture)")))
 
 ;; Describe a user's asm statement.
@@ -3859,3 +3859,4 @@
 (include "sfb.md")
 (include "zc.md")
 (include "corev.md")
+(include "xiangshan.md")
diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md
new file mode 100644
index 00000000000..3ec89bf828e
--- /dev/null
+++ b/gcc/config/riscv/xiangshan.md
@@ -0,0 +1,148 @@
+;; Scheduling description for XiangShan Nanhu.
+
+;; Nanhu is a 6-issue, superscalar, out-of-order processor.
+
+;; -----------------------------------------------------
+;; Nanhu Core units
+;; 1*jmp + 4*alu + 2*mdu + 4*fma + 2*fmisc + 2*ld + 2*st
+;; -----------------------------------------------------
+
+(define_automaton "xiangshan")
+
+(define_cpu_unit "xs_jmp" "xiangshan")
+(define_cpu_unit "xs_i2f" "xiangshan")
+(define_reservation "xs_jmp_rs" "xs_jmp | xs_i2f")
+
+(define_cpu_unit "xs_alu_0, xs_alu_1, xs_alu_2, xs_alu_3" "xiangshan")
+(define_reservation "xs_alu_rs"
+  "xs_alu_0 | xs_alu_1 | xs_alu_2 | xs_alu_3")
+
+(define_cpu_unit "xs_mul_0, xs_mul_1" "xiangshan")
+(define_cpu_unit "xs_div_0, xs_div_1" "xiangshan")
+(define_reservation "xs_mdu_rs"
+  "(xs_mul_0 + xs_div_0) | (xs_mul_1 + xs_div_1)")
+
+(define_cpu_unit "xs_fadd_0, xs_fadd_1, xs_fadd_2, xs_fadd_3" "xiangshan")
+(define_cpu_unit "xs_fmul_0, xs_fmul_1, xs_fmul_2, xs_fmul_3" "xiangshan")
+(define_reservation "xs_fma_0" "xs_fadd_0 + xs_fmul_0")
+(define_reservation "xs_fma_1" "xs_fadd_1 + xs_fmul_1")
+(define_reservation "xs_fma_2" "xs_fadd_2 + xs_fmul_2")
+(define_reservation "xs_fma_3" "xs_fadd_3 + xs_fmul_3")
+
+(define_cpu_unit "xs_f2f_0, xs_f2f_1" "xiangshan")
+(define_cpu_unit "xs_f2i_0, xs_f2i_1" "xiangshan")
+(define_cpu_unit "xs_fdiv_0, xs_fdiv_1" "xiangshan")
+(define_reservation "xs_fmisc_rs"
+  "(xs_f2f_0 + xs_f2i_0 + xs_fdiv_0) | (xs_f2f_1 + xs_f2i_1 + xs_fdiv_1)")
+
+(define_cpu_unit "xs_ld_0, xs_ld_1" "xiangshan")
+(define_cpu_unit "xs_st_0, xs_st_1" "xiangshan")
+(define_reservation "xs_ld_rs" "xs_ld_0 | xs_ld_1")
+(define_reservation "xs_st_rs" "xs_st_0 | xs_st_1")
+
+;; ----------------------------------------------------
+;; Memory (load/store)
+;; ----------------------------------------------------
+
+(define_insn_reservation "xiangshan_load" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "load"))
+  "xs_ld_rs")
+
+(define_insn_reservation "xiangshan_fpload" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fpload"))
+  "xs_ld_rs")
+
+(define_insn_reservation "xiangshan_store" 1
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "store"))
+  "xs_st_rs")
+
+(define_insn_reservation "xiangshan_fpstore" 1
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fpstore"))
+  "xs_st_rs")
+
+;; ----------------------------------------------------
+;; Int
+;; ----------------------------------------------------
+
+(define_insn_reservation "xiangshan_jump" 1
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "jump,call,auipc,unknown"))
+  "xs_jmp_rs")
+
+(define_insn_reservation "xiangshan_i2f" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "mtc"))
+  "xs_jmp_rs")
+
+(define_insn_reservation "xiangshan_mul" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "imul"))
+  "xs_mdu_rs")
+
+(define_insn_reservation "xiangshan_div" 21
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "idiv"))
+  "xs_mdu_rs")
+
+(define_insn_reservation "xiangshan_alu" 1
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "nop,const,branch,arith,shift,slt,multi,logical,move,bitmanip,unknown"))
+  "xs_alu_rs")
+
+;; ----------------------------------------------------
+;; Float
+;; ----------------------------------------------------
+
+
+(define_insn_reservation "xiangshan_fma" 5
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fmadd"))
+  "xs_fma_0 | xs_fma_1 | xs_fma_2 | xs_fma_3")
+
+(define_insn_reservation "xiangshan_fadd" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fadd"))
+  "xs_fadd_0 | xs_fadd_1 | xs_fadd_2 | xs_fadd_3")
+
+(define_insn_reservation "xiangshan_fmul" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fmul"))
+  "xs_fmul_0 | xs_fmul_1 | xs_fmul_2 | xs_fmul_3")
+
+(define_insn_reservation "xiangshan_f2f" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fcvt,fmove"))
+  "xs_fmisc_rs")
+
+(define_insn_reservation "xiangshan_f2i" 3
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "mfc,fcmp"))
+  "xs_fmisc_rs")
+
+(define_insn_reservation "xiangshan_sfdiv" 11
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fdiv")
+       (eq_attr "mode" "SF"))
+  "xs_fmisc_rs")
+
+(define_insn_reservation "xiangshan_sfsqrt" 17
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fsqrt")
+       (eq_attr "mode" "SF"))
+  "xs_fmisc_rs")
+
+(define_insn_reservation "xiangshan_dfdiv" 21
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fdiv")
+       (eq_attr "mode" "DF"))
+  "xs_fmisc_rs")
+
+(define_insn_reservation "xiangshan_dfsqrt" 37
+  (and (eq_attr "tune" "xiangshan")
+       (eq_attr "type" "fsqrt")
+       (eq_attr "mode" "DF"))
+  "xs_fmisc_rs")
diff --git a/gcc/testsuite/gcc.target/riscv/mcpu-xiangshan-nanhu.c b/gcc/testsuite/gcc.target/riscv/mcpu-xiangshan-nanhu.c
new file mode 100644
index 00000000000..2903c88d91c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/mcpu-xiangshan-nanhu.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-skip-if "-march given" { *-*-* } { "-march=*" } } */
+/* { dg-options "-mcpu=xiangshan-nanhu" { target { rv64 } } } */
+/* XiangShan Nanhu => rv64imafdc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd
+                      _zkne_zknh_zksed_zksh_svinval_zicbom_zicboz */
+
+#if !((__riscv_xlen == 64)		\
+      && !defined(__riscv_32e)		\
+      && defined(__riscv_mul)		\
+      && defined(__riscv_atomic)	\
+      && (__riscv_flen == 64)		\
+      && defined(__riscv_compressed)	\
+      && defined(__riscv_zicbom)	\
+      && defined(__riscv_zicboz)	\
+      && defined(__riscv_zba)	\
+      && defined(__riscv_zbb)	\
+      && defined(__riscv_zbc)	\
+      && defined(__riscv_zbs)	\
+      && defined(__riscv_zbkb)	\
+      && defined(__riscv_zbkc)	\
+      && defined(__riscv_zbkx)	\
+      && defined(__riscv_zknd)	\
+      && defined(__riscv_zkne)	\
+      && defined(__riscv_zknh)	\
+      && defined(__riscv_zksed)	\
+      && defined(__riscv_zksh)	\
+      && defined(__riscv_svinval))
+#error "unexpected arch"
+#endif
+
+int main()
+{
+  return 0;
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.
  2024-02-27  8:52 [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture Jiawei
@ 2024-03-19  2:54 ` Jeff Law
  2024-03-19 12:43   ` jiawei
  2024-03-25 19:48   ` TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.) Xi Ruoyao
  0 siblings, 2 replies; 11+ messages in thread
From: Jeff Law @ 2024-03-19  2:54 UTC (permalink / raw)
  To: Jiawei, gcc-patches
  Cc: kito.cheng, palmer, christoph.muellner, wuwei2016, shihua,
	shiyulong, chenyixuan



On 2/27/24 1:52 AM, Jiawei wrote:
> From: Chen Jiawei <jiawei@iscas.ac.cn>
> 
> Co-Authored by: Lin Jiawei <jiawei.lin@epfl.ch>
> 
> This patch add XiangShan Nanhu cpu microarchitecture,
> Nanhu is a 6-issue, superscalar, out-of-order processor.
> More details see: https://xiangshan-doc.readthedocs.io/zh-cn/latest/arch
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-cores.def (RISCV_TUNE): New def.
>          (RISCV_CORE): Ditto.
>          * config/riscv/riscv-opts.h (enum
>          * riscv_microarchitecture_type): New option.
>          * config/riscv/riscv.cc: New def.
>          * config/riscv/riscv.md: New include.
>          * config/riscv/xiangshan.md: New file.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/mcpu-xiangshan-nanhu.c: New test.
As was discussed last Tuesday, this should be safe, even at this late 
stage in the gcc-14 cycle.

>   
> +/* Costs to use when optimizing for xiangshan nanhu.  */
> +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
> +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
> +  6,						/* issue_rate */
> +  3,						/* branch_cost */
> +  3,						/* memory_cost */
> +  3,						/* fmv_cost */
> +  true,						/* slow_unaligned_access */
> +  false,					/* use_divmod_expansion */
> +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
> +  NULL,						/* vector cost */
Is your integer division really that fast?  The table above essentially 
says that your cpu can do integer division in 6 cycles.

> +
> +(define_insn_reservation "xiangshan_mul" 3
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "imul"))
> +  "xs_mdu_rs")
> +
> +(define_insn_reservation "xiangshan_div" 21
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "idiv"))
> +  "xs_mdu_rs")
Whereas your pipeline description says it's 21c.

I strongly suspect you want to increase the cost of the int_div in the 
tuning table.  And with the higher cost you probably want to turn on 
use_divmod_expansion.

I'll also note that your scheduler description also indicates your 
division is fully pipelined.  Is that correct?  If not, you'll want to 
adjust that reservation.



> +
> +(define_insn_reservation "xiangshan_sfdiv" 11
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "fdiv")
> +       (eq_attr "mode" "SF"))
> +  "xs_fmisc_rs")
> +
> +(define_insn_reservation "xiangshan_sfsqrt" 17
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "fsqrt")
> +       (eq_attr "mode" "SF"))
> +  "xs_fmisc_rs")
> +
> +(define_insn_reservation "xiangshan_dfdiv" 21
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "fdiv")
> +       (eq_attr "mode" "DF"))
> +  "xs_fmisc_rs")
> +
> +(define_insn_reservation "xiangshan_dfsqrt" 37
> +  (and (eq_attr "tune" "xiangshan")
> +       (eq_attr "type" "fsqrt")
> +       (eq_attr "mode" "DF"))
> +  "xs_fmisc_rs")
Similarly these say your fpdiv and fpsqrt are fully pipelined.  It's 
certainly possible, but I suspect it's really just an oversight.  Given 
these values you may also want to adjust the cost of an fp division in 
the cost table.


Finally, with such high values for the div/sqrt units, we find that 
the DFA "blows up" causing genattrtab to run for a very long time. We'll 
have to keep an eye on that.

And just to be clear, I think these can be done as a followup patch. I'm 
going to push this patch as-is rather than make any adjustments -- you 
almost certainly know the processor's capabilities better than myself or 
anyone else on this list :-)


Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.
  2024-03-19  2:54 ` Jeff Law
@ 2024-03-19 12:43   ` jiawei
  2024-03-25 19:48   ` TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.) Xi Ruoyao
  1 sibling, 0 replies; 11+ messages in thread
From: jiawei @ 2024-03-19 12:43 UTC (permalink / raw)
  To: Jeff Law
  Cc: gcc-patches, kito.cheng, palmer, christoph.muellner, wuwei2016,
	shihua, shiyulong, chenyixuan




&gt; -----原始邮件-----
&gt; 发件人: "Jeff Law" <jlaw@ventanamicro.com>
&gt; 发送时间: 2024-03-19 10:54:09 (星期二)
&gt; 收件人: Jiawei <jiawei@iscas.ac.cn>, gcc-patches@gcc.gnu.org
&gt; 抄送: kito.cheng@sifive.com, palmer@dabbelt.com, christoph.muellner@vrull.eu, wuwei2016@iscas.ac.cn, shihua@iscas.ac.cn, shiyulong@iscas.ac.cn, chenyixuan@iscas.ac.cn
&gt; 主题: Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.
&gt; 
&gt; 
&gt; 
&gt; On 2/27/24 1:52 AM, Jiawei wrote:
&gt; &gt; From: Chen Jiawei <jiawei@iscas.ac.cn>
&gt; &gt; 
&gt; &gt; Co-Authored by: Lin Jiawei <jiawei.lin@epfl.ch>
&gt; &gt; 
&gt; &gt; This patch add XiangShan Nanhu cpu microarchitecture,
&gt; &gt; Nanhu is a 6-issue, superscalar, out-of-order processor.
&gt; &gt; More details see: https://xiangshan-doc.readthedocs.io/zh-cn/latest/arch
&gt; &gt; 
&gt; &gt; gcc/ChangeLog:
&gt; &gt; 
&gt; &gt;          * config/riscv/riscv-cores.def (RISCV_TUNE): New def.
&gt; &gt;          (RISCV_CORE): Ditto.
&gt; &gt;          * config/riscv/riscv-opts.h (enum
&gt; &gt;          * riscv_microarchitecture_type): New option.
&gt; &gt;          * config/riscv/riscv.cc: New def.
&gt; &gt;          * config/riscv/riscv.md: New include.
&gt; &gt;          * config/riscv/xiangshan.md: New file.
&gt; &gt; 
&gt; &gt; gcc/testsuite/ChangeLog:
&gt; &gt; 
&gt; &gt;          * gcc.target/riscv/mcpu-xiangshan-nanhu.c: New test.
&gt; As was discussed last Tuesday, this should be safe, even at this late 
&gt; stage in the gcc-14 cycle.
&gt; 
&gt; &gt;   
&gt; &gt; +/* Costs to use when optimizing for xiangshan nanhu.  */
&gt; &gt; +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
&gt; &gt; +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
&gt; &gt; +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
&gt; &gt; +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
&gt; &gt; +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
&gt; &gt; +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
&gt; &gt; +  6,						/* issue_rate */
&gt; &gt; +  3,						/* branch_cost */
&gt; &gt; +  3,						/* memory_cost */
&gt; &gt; +  3,						/* fmv_cost */
&gt; &gt; +  true,						/* slow_unaligned_access */
&gt; &gt; +  false,					/* use_divmod_expansion */
&gt; &gt; +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
&gt; &gt; +  NULL,						/* vector cost */
&gt; Is your integer division really that fast?  The table above essentially 
&gt; says that your cpu can do integer division in 6 cycles.
&gt; 
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_mul" 3
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "imul"))
&gt; &gt; +  "xs_mdu_rs")
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_div" 21
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "idiv"))
&gt; &gt; +  "xs_mdu_rs")
&gt; Whereas your pipeline description says it's 21c.
&gt; 
&gt; I strongly suspect you want to increase the cost of the int_div in the 
&gt; tuning table.  And with a the higher cost you probably want to turn on 
&gt; use_divmod_expansion.
&gt; 
&gt; I'll also note that your scheduler description also indicates your 
&gt; division is fully pipelined.  Is that correct?  if not, you'll want to 
&gt; adjust that reservation.
&gt; 
&gt; 
&gt; 
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_sfdiv" 11
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "fdiv")
&gt; &gt; +       (eq_attr "mode" "SF"))
&gt; &gt; +  "xs_fmisc_rs")
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_sfsqrt" 17
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "fsqrt")
&gt; &gt; +       (eq_attr "mode" "SF"))
&gt; &gt; +  "xs_fmisc_rs")
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_dfdiv" 21
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "fdiv")
&gt; &gt; +       (eq_attr "mode" "DF"))
&gt; &gt; +  "xs_fmisc_rs")
&gt; &gt; +
&gt; &gt; +(define_insn_reservation "xiangshan_dfsqrt" 37
&gt; &gt; +  (and (eq_attr "tune" "xiangshan")
&gt; &gt; +       (eq_attr "type" "fsqrt")
&gt; &gt; +       (eq_attr "mode" "DF"))
&gt; &gt; +  "xs_fmisc_rs")
&gt; Similarly these say your fpdiv and fpsqrt are fully pipelined.  It's 
&gt; certainly possible, but I suspect it's really just an oversight.  Given 
&gt; these values you may also want to adjust the cost of an fp division in 
&gt; the cost table.
&gt; 
&gt; 
&gt; Finally with such high values for for the div/sqrt units, we find that 
&gt; the DFA "blows up" causing genattrtab to run for a very long time. We'll 
&gt; have to keep an eye on that.
&gt; 
&gt; And just to be clear, I think these can be done as a followup patch. I'm 
&gt; going to push this patch as-is rather than make any adjustments -- you 
&gt; almost certainly know the processor's capabilities better than myself or 
&gt; anyone else on this list :-)
&gt; 
&gt; 
&gt; Jeff

Thank you for the comment; some pipeline processing costs may still need to
be confirmed, and I will correct them in the next patch.

BR,
Jiawei

^ permalink raw reply	[flat|nested] 11+ messages in thread

* TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-19  2:54 ` Jeff Law
  2024-03-19 12:43   ` jiawei
@ 2024-03-25 19:48   ` Xi Ruoyao
  2024-03-25 19:59     ` Jeff Law
  1 sibling, 1 reply; 11+ messages in thread
From: Xi Ruoyao @ 2024-03-25 19:48 UTC (permalink / raw)
  To: Jeff Law, Jiawei, gcc-patches
  Cc: kito.cheng, palmer, christoph.muellner, wuwei2016, shihua,
	shiyulong, chenyixuan

On Mon, 2024-03-18 at 20:54 -0600, Jeff Law wrote:
> > +/* Costs to use when optimizing for xiangshan nanhu.  */
> > +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
> > +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
> > +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
> > +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
> > +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
> > +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
> > +  6,						/* issue_rate */
> > +  3,						/* branch_cost */
> > +  3,						/* memory_cost */
> > +  3,						/* fmv_cost */
> > +  true,						/* slow_unaligned_access */
> > +  false,					/* use_divmod_expansion */
> > +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
> > +  NULL,						/* vector cost */

> Is your integer division really that fast?  The table above essentially 
> says that your cpu can do integer division in 6 cycles.

Hmm, I just seen I've coded some even smaller value for LoongArch CPUs
so forgive me for "hijacking" this thread...

The problem seems integer division may spend different number of cycles
for different inputs: on LoongArch LA664 I've observed 5 cycles for some
inputs and 39 cycles for other inputs.

So should we use the minimal value, the maximum value, or something in-
between for TARGET_RTX_COSTS and pipeline descriptions?

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 19:48   ` TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.) Xi Ruoyao
@ 2024-03-25 19:59     ` Jeff Law
  2024-03-25 20:13       ` Palmer Dabbelt
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Law @ 2024-03-25 19:59 UTC (permalink / raw)
  To: Xi Ruoyao, Jiawei, gcc-patches
  Cc: kito.cheng, palmer, christoph.muellner, wuwei2016, shihua,
	shiyulong, chenyixuan



On 3/25/24 1:48 PM, Xi Ruoyao wrote:
> On Mon, 2024-03-18 at 20:54 -0600, Jeff Law wrote:
>>> +/* Costs to use when optimizing for xiangshan nanhu.  */
>>> +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
>>> +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
>>> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
>>> +  6,						/* issue_rate */
>>> +  3,						/* branch_cost */
>>> +  3,						/* memory_cost */
>>> +  3,						/* fmv_cost */
>>> +  true,						/* slow_unaligned_access */
>>> +  false,					/* use_divmod_expansion */
>>> +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
>>> +  NULL,						/* vector cost */
> 
>> Is your integer division really that fast?  The table above essentially
>> says that your cpu can do integer division in 6 cycles.
> 
> Hmm, I just seen I've coded some even smaller value for LoongArch CPUs
> so forgive me for "hijacking" this thread...
> 
> The problem seems integer division may spend different number of cycles
> for different inputs: on LoongArch LA664 I've observed 5 cycles for some
> inputs and 39 cycles for other inputs.
> 
> So should we use the minimal value, the maximum value, or something in-
> between for TARGET_RTX_COSTS and pipeline descriptions?
Yea, early outs are relatively common in the actual hardware 
implementation.

The biggest reason to refine the cost of a division is so that we've got 
a reasonably accurate cost for division by a constant -- which can often 
be done with multiplication by reciprocal sequence.  The multiplication 
by reciprocal sequence will use mult, add, sub, shadd insns and you need 
a reasonable cost model for those so you can compare against the cost of 
a hardware division.

So to answer your question.  Choose something sensible, you probably 
don't want the fastest case and you may not want the slowest case.

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 19:59     ` Jeff Law
@ 2024-03-25 20:13       ` Palmer Dabbelt
  2024-03-25 20:27         ` Jeff Law
  0 siblings, 1 reply; 11+ messages in thread
From: Palmer Dabbelt @ 2024-03-25 20:13 UTC (permalink / raw)
  To: Jeff Law
  Cc: xry111, jiawei, gcc-patches, kito.cheng, christoph.muellner,
	wuwei2016, shihua, shiyulong, chenyixuan

On Mon, 25 Mar 2024 12:59:14 PDT (-0700), Jeff Law wrote:
>
>
> On 3/25/24 1:48 PM, Xi Ruoyao wrote:
>> On Mon, 2024-03-18 at 20:54 -0600, Jeff Law wrote:
>>>> +/* Costs to use when optimizing for xiangshan nanhu.  */
>>>> +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_add */
>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* fp_mul */
>>>> +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},	/* fp_div */
>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},	/* int_mul */
>>>> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},	/* int_div */
>>>> +  6,						/* issue_rate */
>>>> +  3,						/* branch_cost */
>>>> +  3,						/* memory_cost */
>>>> +  3,						/* fmv_cost */
>>>> +  true,						/* slow_unaligned_access */
>>>> +  false,					/* use_divmod_expansion */
>>>> +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
>>>> +  NULL,						/* vector cost */
>>
>>> Is your integer division really that fast?  The table above essentially
>>> says that your cpu can do integer division in 6 cycles.
>>
>> Hmm, I just seen I've coded some even smaller value for LoongArch CPUs
>> so forgive me for "hijacking" this thread...
>>
>> The problem seems integer division may spend different number of cycles
>> for different inputs: on LoongArch LA664 I've observed 5 cycles for some
>> inputs and 39 cycles for other inputs.
>>
>> So should we use the minimal value, the maximum value, or something in-
>> between for TARGET_RTX_COSTS and pipeline descriptions?
> Yea, early outs are relatively common in the actual hardware
> implementation.
>
> The biggest reason to refine the cost of a division is so that we've got
> a reasonably accurate cost for division by a constant -- which can often
> be done with multiplication by reciprocal sequence.  The multiplication
> by reciprocal sequence will use mult, add, sub, shadd insns and you need
> a reasonable cost model for those so you can compare against the cost of
> a hardware division.
>
> So to answer your question.  Choose something sensible, you probably
> don't want the fastest case and you may not want the slowest case.

Maybe we should have some sort of per-bit-set cost hook for mul/div?  
Without that we're kind of just guessing at whether the implementation 
has early outs based on heuristics used to implicitly generate the cost 
models.

Not sure that's really worth the complexity, though...

> Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 20:13       ` Palmer Dabbelt
@ 2024-03-25 20:27         ` Jeff Law
  2024-03-25 20:31           ` Palmer Dabbelt
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Law @ 2024-03-25 20:27 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: xry111, jiawei, gcc-patches, kito.cheng, christoph.muellner,
	wuwei2016, shihua, shiyulong, chenyixuan



On 3/25/24 2:13 PM, Palmer Dabbelt wrote:
> On Mon, 25 Mar 2024 12:59:14 PDT (-0700), Jeff Law wrote:
>>
>>
>> On 3/25/24 1:48 PM, Xi Ruoyao wrote:
>>> On Mon, 2024-03-18 at 20:54 -0600, Jeff Law wrote:
>>>>> +/* Costs to use when optimizing for xiangshan nanhu.  */
>>>>> +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* fp_add */
>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* fp_mul */
>>>>> +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},    /* fp_div */
>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* int_mul */
>>>>> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},    /* int_div */
>>>>> +  6,                        /* issue_rate */
>>>>> +  3,                        /* branch_cost */
>>>>> +  3,                        /* memory_cost */
>>>>> +  3,                        /* fmv_cost */
>>>>> +  true,                        /* slow_unaligned_access */
>>>>> +  false,                    /* use_divmod_expansion */
>>>>> +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
>>>>> +  NULL,                        /* vector cost */
>>>
>>>> Is your integer division really that fast?  The table above essentially
>>>> says that your cpu can do integer division in 6 cycles.
>>>
>>> Hmm, I just seen I've coded some even smaller value for LoongArch CPUs
>>> so forgive me for "hijacking" this thread...
>>>
>>> The problem seems integer division may spend different number of cycles
>>> for different inputs: on LoongArch LA664 I've observed 5 cycles for some
>>> inputs and 39 cycles for other inputs.
>>>
>>> So should we use the minimal value, the maximum value, or something in-
>>> between for TARGET_RTX_COSTS and pipeline descriptions?
>> Yea, early outs are relatively common in the actual hardware
>> implementation.
>>
>> The biggest reason to refine the cost of a division is so that we've got
>> a reasonably accurate cost for division by a constant -- which can often
>> be done with multiplication by reciprocal sequence.  The multiplication
>> by reciprocal sequence will use mult, add, sub, shadd insns and you need
>> a reasonable cost model for those so you can compare against the cost of
>> a hardware division.
>>
>> So to answer your question.  Choose something sensible, you probably
>> don't want the fastest case and you may not want the slowest case.
> 
> Maybe we should have some sort of per-bit-set cost hook for mul/div? 
> Without that we're kind of just guessing at whether the implmentation 
> has early outs based on hueristics used to implicitly generate the cost 
> models.
> 
> Not sure that's really worth the complexity, though...
I'd doubt it's worth the complexity.  Picking some reasonable value gets 
you the vast majority of the benefit.   Something like
COSTS_N_INSNS(6) is enough to get CSE to trigger.  So what's left is a 
reasonable cost, particularly for the division-by-constant case where we 
need a ceiling for synth_mult.

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 20:27         ` Jeff Law
@ 2024-03-25 20:31           ` Palmer Dabbelt
  2024-03-25 20:49             ` Jeff Law
  0 siblings, 1 reply; 11+ messages in thread
From: Palmer Dabbelt @ 2024-03-25 20:31 UTC (permalink / raw)
  To: Jeff Law
  Cc: xry111, jiawei, gcc-patches, kito.cheng, christoph.muellner,
	wuwei2016, shihua, shiyulong, chenyixuan

On Mon, 25 Mar 2024 13:27:34 PDT (-0700), Jeff Law wrote:
>
>
> On 3/25/24 2:13 PM, Palmer Dabbelt wrote:
>> On Mon, 25 Mar 2024 12:59:14 PDT (-0700), Jeff Law wrote:
>>>
>>>
>>> On 3/25/24 1:48 PM, Xi Ruoyao wrote:
>>>> On Mon, 2024-03-18 at 20:54 -0600, Jeff Law wrote:
>>>>>> +/* Costs to use when optimizing for xiangshan nanhu.  */
>>>>>> +static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
>>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* fp_add */
>>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* fp_mul */
>>>>>> +  {COSTS_N_INSNS (10), COSTS_N_INSNS (20)},    /* fp_div */
>>>>>> +  {COSTS_N_INSNS (3), COSTS_N_INSNS (3)},    /* int_mul */
>>>>>> +  {COSTS_N_INSNS (6), COSTS_N_INSNS (6)},    /* int_div */
>>>>>> +  6,                        /* issue_rate */
>>>>>> +  3,                        /* branch_cost */
>>>>>> +  3,                        /* memory_cost */
>>>>>> +  3,                        /* fmv_cost */
>>>>>> +  true,                        /* slow_unaligned_access */
>>>>>> +  false,                    /* use_divmod_expansion */
>>>>>> +  RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,          /* fusible_ops */
>>>>>> +  NULL,                        /* vector cost */
>>>>
>>>>> Is your integer division really that fast?  The table above essentially
>>>>> says that your cpu can do integer division in 6 cycles.
>>>>
>>>> Hmm, I just seen I've coded some even smaller value for LoongArch CPUs
>>>> so forgive me for "hijacking" this thread...
>>>>
>>>> The problem seems integer division may spend different number of cycles
>>>> for different inputs: on LoongArch LA664 I've observed 5 cycles for some
>>>> inputs and 39 cycles for other inputs.
>>>>
>>>> So should we use the minimal value, the maximum value, or something in-
>>>> between for TARGET_RTX_COSTS and pipeline descriptions?
>>> Yea, early outs are relatively common in the actual hardware
>>> implementation.
>>>
>>> The biggest reason to refine the cost of a division is so that we've got
>>> a reasonably accurate cost for division by a constant -- which can often
>>> be done with multiplication by reciprocal sequence.  The multiplication
>>> by reciprocal sequence will use mult, add, sub, shadd insns and you need
>>> a reasonable cost model for those so you can compare against the cost of
>>> a hardware division.
>>>
>>> So to answer your question.  Choose something sensible, you probably
>>> don't want the fastest case and you may not want the slowest case.
>>
>> Maybe we should have some sort of per-bit-set cost hook for mul/div?
>> Without that we're kind of just guessing at whether the implementation
>> has early outs based on heuristics used to implicitly generate the cost
>> models.
>>
>> Not sure that's really worth the complexity, though...
> I'd doubt it's worth the complexity.  Picking some reasonable value gets
> you the vast majority of the benefit.   Something like
> COSTS_N_INSNS(6) is enough to get CSE to trigger.  So what's left is a
> reasonable cost, particularly for the division-by-constant case where we
> need a ceiling for synth_mult.

Ya, makes sense.  I noticed our multi-word multiply costs are a bit odd 
too (they really only work for 64-bit mul on 32-bit targets), but that's 
probably not worth worrying about either.

>
> Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 20:31           ` Palmer Dabbelt
@ 2024-03-25 20:49             ` Jeff Law
  2024-03-25 20:57               ` Palmer Dabbelt
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Law @ 2024-03-25 20:49 UTC (permalink / raw)
  To: gcc-patches



On 3/25/24 2:31 PM, Palmer Dabbelt wrote:
> On Mon, 25 Mar 2024 13:27:34 PDT (-0700), Jeff Law wrote:

>> I'd doubt it's worth the complexity.  Picking some reasonable value gets
>> you the vast majority of the benefit.   Something like
>> COSTS_N_INSNS(6) is enough to get CSE to trigger.  So what's left is a
>> reasonable cost, particularly for the division-by-constant case where we
>> need a ceiling for synth_mult.
> 
> Ya, makes sense.  I noticed our multi-word multiply costs are a bit odd 
> too (they really only work for 64-bit mul on 32-bit targets), but that's 
> probably not worth worrying about either.
We do have changes locally that adjust various costs.  One of which is 
highpart multiply.  One of the many things to start working through once 
gcc-15 opens for development.  Hence my desire to help keep gcc-14 on 
track for an on-time release.

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 20:49             ` Jeff Law
@ 2024-03-25 20:57               ` Palmer Dabbelt
  2024-03-25 21:41                 ` Jeff Law
  0 siblings, 1 reply; 11+ messages in thread
From: Palmer Dabbelt @ 2024-03-25 20:57 UTC (permalink / raw)
  To: jeffreyalaw; +Cc: gcc-patches

On Mon, 25 Mar 2024 13:49:18 PDT (-0700), jeffreyalaw@gmail.com wrote:
>
>
> On 3/25/24 2:31 PM, Palmer Dabbelt wrote:
>> On Mon, 25 Mar 2024 13:27:34 PDT (-0700), Jeff Law wrote:
>
>>> I'd doubt it's worth the complexity.  Picking some reasonable value gets
>>> you the vast majority of the benefit.   Something like
>>> COSTS_N_INSNS(6) is enough to get CSE to trigger.  So what's left is a
>>> reasonable cost, particularly for the division-by-constant case where we
>>> need a ceiling for synth_mult.
>>
>> Ya, makes sense.  I noticed our multi-word multiply costs are a bit odd
>> too (they really only work for 64-bit mul on 32-bit targets), but that's
>> probably not worth worrying about either.
> We do have changes locally that adjust various costs.  One of which is
> highpart multiply.  One of the many things to start working through once
> gcc-15 opens for development.  Hence my desire to help keep gcc-14 on
> track for an on-time release.

Cool.  LMK if there's anything we can do to help on that front.

>
> Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.)
  2024-03-25 20:57               ` Palmer Dabbelt
@ 2024-03-25 21:41                 ` Jeff Law
  0 siblings, 0 replies; 11+ messages in thread
From: Jeff Law @ 2024-03-25 21:41 UTC (permalink / raw)
  To: Palmer Dabbelt; +Cc: gcc-patches



On 3/25/24 2:57 PM, Palmer Dabbelt wrote:
> On Mon, 25 Mar 2024 13:49:18 PDT (-0700), jeffreyalaw@gmail.com wrote:
>>
>>
>> On 3/25/24 2:31 PM, Palmer Dabbelt wrote:
>>> On Mon, 25 Mar 2024 13:27:34 PDT (-0700), Jeff Law wrote:
>>
>>>> I'd doubt it's worth the complexity.  Picking some reasonable value 
>>>> gets
>>>> you the vast majority of the benefit.   Something like
>>>> COSTS_N_INSNS(6) is enough to get CSE to trigger.  So what's left is a
>>>> reasonable cost, particularly for the division-by-constant case 
>>>> where we
>>>> need a ceiling for synth_mult.
>>>
>>> Ya, makes sense.  I noticed our multi-word multiply costs are a bit odd
>>> too (they really only work for 64-bit mul on 32-bit targets), but that's
>>> probably not worth worrying about either.
>> We do have changes locally that adjust various costs.  One of which is
>> highpart multiply.  One of the many things to start working through once
>> gcc-15 opens for development.  Hence my desire to help keep gcc-14 on
>> track for an on-time release.
> 
> Cool.  LMK if there's anything we can do to help on that front.
I think the RISC-V space is in pretty good shape.   Most of the issues 
left are either generic or hitting other targets.  While the number of 
P1s has been flat or rising, that's more an artifact of bug 
triage/reprioritization process that's ongoing.  I can only speak for 
myself, but the progress in nailing down the slew of bugs thrown into 
the P1 bucket over the last few weeks has been great IMHO.

jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-03-25 21:41 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-27  8:52 [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture Jiawei
2024-03-19  2:54 ` Jeff Law
2024-03-19 12:43   ` jiawei
2024-03-25 19:48   ` TARGET_RTX_COSTS and pipeline latency vs. variable-latency instructions (was Re: [PATCH] RISC-V: Add XiangShan Nanhu microarchitecture.) Xi Ruoyao
2024-03-25 19:59     ` Jeff Law
2024-03-25 20:13       ` Palmer Dabbelt
2024-03-25 20:27         ` Jeff Law
2024-03-25 20:31           ` Palmer Dabbelt
2024-03-25 20:49             ` Jeff Law
2024-03-25 20:57               ` Palmer Dabbelt
2024-03-25 21:41                 ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).