public inbox for gcc-patches@gcc.gnu.org
* [RFC] [aarch64] Add HiSilicon tsv110 CPU support
@ 2018-05-22  8:41 Shaokun Zhang
  2018-05-22  8:43 ` Shaokun Zhang
  2018-05-22 10:53 ` Ramana Radhakrishnan
  0 siblings, 2 replies; 12+ messages in thread
From: Shaokun Zhang @ 2018-05-22  8:41 UTC (permalink / raw)
  To: gcc-patches; +Cc: Shaokun Zhang

tsv110 is designed by HiSilicon and supports ARMv8.4-A. Its L1 Icache is
optimized so that it can access the L1 Dcache directly; therefore, DC CVAU
is not necessary in __aarch64_sync_cache_range for tsv110. Is there a good
way to skip the DC CVAU operation for tsv110?
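
For illustration, a minimal sketch (not the current libgcc code) of how a
sync routine could skip DC CVAU at run time by checking CTR_EL0.IDC (bit 28),
which, where implemented, advertises that cleaning the D-cache to the point
of unification is not required for instruction-to-data coherence.  Cache-line
alignment of the addresses is omitted for brevity:

#include <stdint.h>

/* Illustrative sketch only: sync a range of generated code with the
   I-cache, skipping DC CVAU when CTR_EL0.IDC says it is not needed.  */
static void
sync_cache_range_sketch (const char *base, const char *end)
{
  uint64_t ctr;
  __asm__ volatile ("mrs %0, ctr_el0" : "=r" (ctr));

  unsigned dline = 4 << ((ctr >> 16) & 0xf);  /* DminLine, in bytes.  */
  unsigned iline = 4 << (ctr & 0xf);          /* IminLine, in bytes.  */
  int idc = (ctr >> 28) & 1;                  /* D-clean not required.  */

  if (!idc)
    for (const char *p = base; p < end; p += dline)
      __asm__ volatile ("dc cvau, %0" :: "r" (p) : "memory");

  __asm__ volatile ("dsb ish" ::: "memory");

  for (const char *p = base; p < end; p += iline)
    __asm__ volatile ("ic ivau, %0" :: "r" (p) : "memory");

  __asm__ volatile ("dsb ish\n\tisb" ::: "memory");
}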

Any thoughts and ideas are welcome.

Shaokun Zhang (1):
  [aarch64] Add HiSilicon tsv110 CPU support.

 gcc/ChangeLog                            |   9 +++
 gcc/config/aarch64/aarch64-cores.def     |   5 ++
 gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64-tune.md       |   2 +-
 gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
 gcc/doc/invoke.texi                      |   2 +-
 6 files changed, 198 insertions(+), 2 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC] [aarch64] Add HiSilicon tsv110 CPU support.
  2018-05-22  8:41 [RFC] [aarch64] Add HiSilicon tsv110 CPU support Shaokun Zhang
@ 2018-05-22  8:43 ` Shaokun Zhang
  2018-05-22 11:23   ` Kyrill Tkachov
  2018-05-22 10:53 ` Ramana Radhakrishnan
  1 sibling, 1 reply; 12+ messages in thread
From: Shaokun Zhang @ 2018-05-22  8:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: Shaokun Zhang

This patch adds support for HiSilicon's tsv110 CPU as an -mcpu option.

---
 gcc/ChangeLog                            |   9 +++
 gcc/config/aarch64/aarch64-cores.def     |   5 ++
 gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64-tune.md       |   2 +-
 gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
 gcc/doc/invoke.texi                      |   2 +-
 6 files changed, 198 insertions(+), 2 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index cec2892..5d44966 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2018-05-22  Shaokun Zhang  <zhangshaokun@hisilicon.com>
+            Bo Zhou  <zbo.zhou@hisilicon.com>
+
+	* config/aarch64/aarch64-cores.def (tsv110): New CPU.
+	* config/aarch64/aarch64-tune.md: Regenerated.
+	* doc/invoke.texi (AArch61 Options/-mtune): Add "tsv110".
+	* gcc/config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
+	* gcc/config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.
+
 2018-05-21  Michael Meissner  <meissner@linux.ibm.com>
 
 	PR target/85657
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 33b96ca..db7a412 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -91,6 +91,11 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2
 /* Qualcomm ('Q') cores. */
 AARCH64_CORE("saphira",     saphira,    falkor,    8_3A,  AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 0xC01, -1)
 
+/* ARMv8.4-A Architecture Processors.  */
+
+/* HiSilicon ('H') cores. */
+AARCH64_CORE("tsv110",     tsv110,    tsv110,    8_4A,  AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)
+
 /* ARMv8-A big.LITTLE implementations.  */
 
 AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index a455c62..b6890d6 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -334,4 +334,107 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   }
 };
 
+const struct cpu_cost_table tsv110_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    COSTS_N_INSNS (1), /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,	               /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (2),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (2),       /* extend_add.  */
+      COSTS_N_INSNS (11)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (3),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (3),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (4),         /* loadv.  */
+    COSTS_N_INSNS (4)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (10),      /* div.  */
+      COSTS_N_INSNS (4),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (4),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (1),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (2),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (17),      /* div.  */
+      COSTS_N_INSNS (4),       /* mult.  */
+      COSTS_N_INSNS (6),       /* mult_addsub.  */
+      COSTS_N_INSNS (6),       /* fma.  */
+      COSTS_N_INSNS (3),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (1),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (2),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1)  /* alu.  */
+  }
+};
+
 #endif
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index 7b3a746..a10f2e7 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
+	"cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,tsv110,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6bf6c05..0788c14 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -266,6 +266,22 @@ static const struct cpu_addrcost_table generic_addrcost_table =
   0 /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table tsv110_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* register_offset  */
+  1, /* register_sextend  */
+  1, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 {
     {
@@ -344,6 +360,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost tsv110_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  2, /* GP2FP  */
+  3, /* FP2GP  */
+  2  /* FP2FP  */
+};
+
 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 {
   1, /* GP2GP  */
@@ -450,6 +476,25 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   1 /* cond_not_taken_branch_cost  */
 };
 
+static const struct cpu_vector_cost tsv110_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
+  2, /* vec_permute_cost  */
+  3, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  5, /* vec_align_load_cost  */
+  5, /* vec_unalign_load_cost  */
+  1, /* vec_unalign_store_cost  */
+  1, /* vec_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1 /* cond_not_taken_branch_cost  */
+};
+
 static const struct cpu_vector_cost exynosm1_vector_cost =
 {
   1, /* scalar_int_stmt_cost  */
@@ -550,6 +595,15 @@ static const cpu_prefetch_tune generic_prefetch_tune =
   -1			/* default_opt_level  */
 };
 
+static const cpu_prefetch_tune tsv110_prefetch_tune =
+{
+  0,			/* num_slots  */
+  64,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  512,			/* l2_cache_size  */
+  -1			/* default_opt_level  */
+};
+
 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 {
   0,			/* num_slots  */
@@ -751,6 +805,31 @@ static const struct tune_params cortexa73_tunings =
 };
 
 
+static const struct tune_params tsv110_tunings =
+{
+  &tsv110_extra_costs,
+  &tsv110_addrcost_table,
+  &tsv110_regmove_cost,
+  &tsv110_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  4, /* memmov_cost  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
+  16,	/* function_align.  */
+  4,	/* jump_align.  */
+  8,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &tsv110_prefetch_tune
+};
 
 static const struct tune_params exynosm1_tunings =
 {
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index beba295..55fcd42 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14713,7 +14713,7 @@ performance of the code.  Permissible values for this option are:
 @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
 @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
 @samp{exynos-m1}, @samp{falkor}, @samp{qdf24xx}, @samp{saphira},
-@samp{xgene1}, @samp{vulcan}, @samp{thunderx},
+@samp{xgene1}, @samp{vulcan}, @samp{thunderx}, @samp{tsv110},
 @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81},
 @samp{thunderxt83}, @samp{thunderx2t99}, @samp{cortex-a57.cortex-a53},
 @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35},
-- 
2.7.4

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-05-22  8:41 [RFC] [aarch64] Add HiSilicon tsv110 CPU support Shaokun Zhang
  2018-05-22  8:43 ` Shaokun Zhang
@ 2018-05-22 10:53 ` Ramana Radhakrishnan
  2018-05-23  3:51   ` Zhangshaokun
  1 sibling, 1 reply; 12+ messages in thread
From: Ramana Radhakrishnan @ 2018-05-22 10:53 UTC (permalink / raw)
  To: Shaokun Zhang; +Cc: gcc-patches

On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
<zhangshaokun@hisilicon.com> wrote:
> tsv110 is designed by HiSilicon and supports ARMv8.4-A. Its L1 Icache is
> optimized so that it can access the L1 Dcache directly; therefore, DC CVAU
> is not necessary in __aarch64_sync_cache_range for tsv110. Is there a good
> way to skip the DC CVAU operation for tsv110?

A solution would be to use an ifunc, dispatching on the CPU variant.
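
For illustration, a minimal sketch of that ifunc pattern (all names here are
hypothetical; the resolver body stands in for whatever CPU check is chosen):

/* Hypothetical sketch of ifunc-based dispatch for the cache-sync helper.  */
extern void sync_cache_generic (const char *, const char *);
extern void sync_cache_tsv110 (const char *, const char *);  /* no DC CVAU  */
extern int running_on_tsv110 (void);  /* hypothetical CPU check (e.g. MIDR)  */

/* The resolver runs once, at load time, and picks an implementation.  */
static void (*resolve_sync_cache (void)) (const char *, const char *)
{
  return running_on_tsv110 () ? sync_cache_tsv110 : sync_cache_generic;
}

void sync_cache (const char *, const char *)
  __attribute__ ((ifunc ("resolve_sync_cache")));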

Is this really that important for performance and on what workloads ?

regards
Ramana

>
> Any thoughts and ideas are welcome.
>
> Shaokun Zhang (1):
>   [aarch64] Add HiSilicon tsv110 CPU support.
>
>  gcc/ChangeLog                            |   9 +++
>  gcc/config/aarch64/aarch64-cores.def     |   5 ++
>  gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>  gcc/config/aarch64/aarch64-tune.md       |   2 +-
>  gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>  gcc/doc/invoke.texi                      |   2 +-
>  6 files changed, 198 insertions(+), 2 deletions(-)
>
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support.
  2018-05-22  8:43 ` Shaokun Zhang
@ 2018-05-22 11:23   ` Kyrill Tkachov
  2018-05-23  5:34     ` Zhangshaokun
  0 siblings, 1 reply; 12+ messages in thread
From: Kyrill Tkachov @ 2018-05-22 11:23 UTC (permalink / raw)
  To: Shaokun Zhang, gcc-patches
  Cc: Marcus Shawcroft, Richard Earnshaw (lists), James Greenhalgh

Hi Shaokun,

On 22/05/18 09:40, Shaokun Zhang wrote:
> This patch adds support for HiSilicon's tsv110 CPU as an -mcpu option.
>
> ---
>  gcc/ChangeLog                            |   9 +++
>  gcc/config/aarch64/aarch64-cores.def     |   5 ++
>  gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>  gcc/config/aarch64/aarch64-tune.md       |   2 +-
>  gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>  gcc/doc/invoke.texi                      |   2 +-
>  6 files changed, 198 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index cec2892..5d44966 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,12 @@
> +2018-05-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
> +            Bo Zhou  <zbo.zhou@hisilicon.com>
> +
> +       * config/aarch64/aarch64-cores.def (tsv110): New CPU.
> +       * config/aarch64/aarch64-tune.md: Regenerated.
> +       * doc/invoke.texi (AArch61 Options/-mtune): Add "tsv110".

typo: AArch64.

> +       * gcc/config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
> +       * gcc/config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.

Please start the path with config/.

> +
>  2018-05-21  Michael Meissner <meissner@linux.ibm.com>
>
>          PR target/85657
> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
> index 33b96ca..db7a412 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -91,6 +91,11 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2
>  /* Qualcomm ('Q') cores. */
>  AARCH64_CORE("saphira",     saphira,    falkor,    8_3A, AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 0xC01, -1)
>
> +/* ARMv8.4-A Architecture Processors.  */
> +
> +/* HiSilicon ('H') cores. */
> +AARCH64_CORE("tsv110",     tsv110,    tsv110,    8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)
> +

The third field is the scheduler model to use when optimising.
Since there is no tsv110 scheduling model, using the name "tsv110"
in the third field will generally give pretty poor schedules.
I recommend you specify a scheduling model that most closely matches your core
for the time being. But I don't think it's required and I wouldn't let it hold
up the patch.

You'll need approval from an aarch64 maintainer (cc'ed some for you).

Thanks,
Kyrill

>  /* ARMv8-A big.LITTLE implementations.  */
>
>  AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
> index a455c62..b6890d6 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -334,4 +334,107 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>    }
>  };
>
> +const struct cpu_cost_table tsv110_extra_costs =
> +{
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    0,                 /* shift_reg.  */
> +    COSTS_N_INSNS (1), /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    COSTS_N_INSNS (1), /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    0,                 /* extend.  */
> +    COSTS_N_INSNS (1), /* extend_arith.  */
> +    0,                 /* bfi.  */
> +    0,                 /* bfx.  */
> +    0,                 /* clz.  */
> +    0,                /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (2),       /* simple.  */
> +      COSTS_N_INSNS (2),       /* flag_setting.  */
> +      COSTS_N_INSNS (2),       /* extend.  */
> +      COSTS_N_INSNS (2),       /* add.  */
> +      COSTS_N_INSNS (2),       /* extend_add.  */
> +      COSTS_N_INSNS (11)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (3),       /* extend.  */
> +      COSTS_N_INSNS (3),       /* add.  */
> +      COSTS_N_INSNS (3),       /* extend_add.  */
> +      COSTS_N_INSNS (19)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (3),         /* load.  */
> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
> +    COSTS_N_INSNS (3),         /* ldrd.  */
> +    COSTS_N_INSNS (3),         /* ldm_1st.  */
> +    1,                         /* ldm_regs_per_insn_1st. */
> +    2,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (4),         /* loadf.  */
> +    COSTS_N_INSNS (4),         /* loadd.  */
> +    COSTS_N_INSNS (4),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    1,                         /* stm_regs_per_insn_1st. */
> +    2,                         /* stm_regs_per_insn_subsequent.  */
> +    0,                         /* storef.  */
> +    0,                         /* stored.  */
> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
> +    COSTS_N_INSNS (4),         /* loadv.  */
> +    COSTS_N_INSNS (4)          /* storev.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (10),      /* div.  */
> +      COSTS_N_INSNS (4),       /* mult.  */
> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
> +      COSTS_N_INSNS (4),       /* fma.  */
> +      COSTS_N_INSNS (4),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      COSTS_N_INSNS (1),       /* neg.  */
> +      COSTS_N_INSNS (1),       /* compare.  */
> +      COSTS_N_INSNS (2),       /* widen.  */
> +      COSTS_N_INSNS (2),       /* narrow.  */
> +      COSTS_N_INSNS (2),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (2)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (17),      /* div.  */
> +      COSTS_N_INSNS (4),       /* mult.  */
> +      COSTS_N_INSNS (6),       /* mult_addsub.  */
> +      COSTS_N_INSNS (6),       /* fma.  */
> +      COSTS_N_INSNS (3),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      COSTS_N_INSNS (1),       /* neg.  */
> +      COSTS_N_INSNS (1),       /* compare.  */
> +      COSTS_N_INSNS (2),       /* widen.  */
> +      COSTS_N_INSNS (2),       /* narrow.  */
> +      COSTS_N_INSNS (2),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (2)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (1)  /* alu.  */
> +  }
> +};
> +
>  #endif
> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
> index 7b3a746..a10f2e7 100644
> --- a/gcc/config/aarch64/aarch64-tune.md
> +++ b/gcc/config/aarch64/aarch64-tune.md
> @@ -1,5 +1,5 @@
>  ;; -*- buffer-read-only: t -*-
>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>  (define_attr "tune"
> - "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
> + "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,tsv110,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>          (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 6bf6c05..0788c14 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -266,6 +266,22 @@ static const struct cpu_addrcost_table generic_addrcost_table =
>    0 /* imm_offset  */
>  };
>
> +static const struct cpu_addrcost_table tsv110_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* register_offset  */
> +  1, /* register_sextend  */
> +  1, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
>  static const struct cpu_addrcost_table exynosm1_addrcost_table =
>  {
>      {
> @@ -344,6 +360,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
>    2 /* FP2FP  */
>  };
>
> +static const struct cpu_regmove_cost tsv110_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  2, /* GP2FP  */
> +  3, /* FP2GP  */
> +  2  /* FP2FP  */
> +};
> +
>  static const struct cpu_regmove_cost exynosm1_regmove_cost =
>  {
>    1, /* GP2GP  */
> @@ -450,6 +476,25 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
>    1 /* cond_not_taken_branch_cost  */
>  };
>
> +static const struct cpu_vector_cost tsv110_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  5, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  2, /* vec_int_stmt_cost  */
> +  2, /* vec_fp_stmt_cost  */
> +  2, /* vec_permute_cost  */
> +  3, /* vec_to_scalar_cost  */
> +  2, /* scalar_to_vec_cost  */
> +  5, /* vec_align_load_cost  */
> +  5, /* vec_unalign_load_cost  */
> +  1, /* vec_unalign_store_cost  */
> +  1, /* vec_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1 /* cond_not_taken_branch_cost  */
> +};
> +
>  static const struct cpu_vector_cost exynosm1_vector_cost =
>  {
>    1, /* scalar_int_stmt_cost  */
> @@ -550,6 +595,15 @@ static const cpu_prefetch_tune generic_prefetch_tune =
>    -1                   /* default_opt_level  */
>  };
>
> +static const cpu_prefetch_tune tsv110_prefetch_tune =
> +{
> +  0,                   /* num_slots  */
> +  64,                  /* l1_cache_size  */
> +  64,                  /* l1_cache_line_size  */
> +  512,                 /* l2_cache_size  */
> +  -1                   /* default_opt_level  */
> +};
> +
>  static const cpu_prefetch_tune exynosm1_prefetch_tune =
>  {
>    0,                   /* num_slots  */
> @@ -751,6 +805,31 @@ static const struct tune_params cortexa73_tunings =
>  };
>
>
> +static const struct tune_params tsv110_tunings =
> +{
> +  &tsv110_extra_costs,
> +  &tsv110_addrcost_table,
> +  &tsv110_regmove_cost,
> +  &tsv110_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  4, /* memmov_cost  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
> +   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
> +  16,  /* function_align.  */
> +  4,   /* jump_align.  */
> +  8,   /* loop_align.  */
> +  2,   /* int_reassoc_width.  */
> +  4,   /* fp_reassoc_width.  */
> +  1,   /* vec_reassoc_width.  */
> +  2,   /* min_div_recip_mul_sf.  */
> +  2,   /* min_div_recip_mul_df.  */
> +  0,   /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
> +  &tsv110_prefetch_tune
> +};
>
>  static const struct tune_params exynosm1_tunings =
>  {
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index beba295..55fcd42 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -14713,7 +14713,7 @@ performance of the code. Permissible values for this option are:
>  @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
>  @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
>  @samp{exynos-m1}, @samp{falkor}, @samp{qdf24xx}, @samp{saphira},
> -@samp{xgene1}, @samp{vulcan}, @samp{thunderx},
> +@samp{xgene1}, @samp{vulcan}, @samp{thunderx}, @samp{tsv110},
>  @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81},
>  @samp{thunderxt83}, @samp{thunderx2t99}, @samp{cortex-a57.cortex-a53},
>  @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35},
> -- 
> 2.7.4
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-05-22 10:53 ` Ramana Radhakrishnan
@ 2018-05-23  3:51   ` Zhangshaokun
  2018-05-23 10:52     ` Ramana Radhakrishnan
  0 siblings, 1 reply; 12+ messages in thread
From: Zhangshaokun @ 2018-05-23  3:51 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: gcc-patches, Marcus Shawcroft, Richard Earnshaw (lists),
	James Greenhalgh, Kyrill Tkachov, felix.yang

Hi Ramana,

On 2018/5/22 18:28, Ramana Radhakrishnan wrote:
> On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
> <zhangshaokun@hisilicon.com> wrote:
>> tsv110 is designed by HiSilicon and supports ARMv8.4-A. Its L1 Icache is
>> optimized so that it can access the L1 Dcache directly; therefore, DC CVAU
>> is not necessary in __aarch64_sync_cache_range for tsv110. Is there a good
>> way to skip the DC CVAU operation for tsv110?
> 
> A solution would be to use an ifunc, dispatching on the CPU variant.
> 

Can you give a further explanation of the ifunc approach?
Regarding the CPU variant: HiSilicon tsv110 has two versions, with CPU
variant fields 0 and 1. Both are expected to skip the DC CVAU operation
when syncing the icache and dcache.
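
For reference, the kind of variant-agnostic check we would want is sketched
below (assuming MIDR_EL1 can be read from user space, e.g. via the kernel's
EL0 emulation on Linux/arm64); it matches only the implementer (0x48) and
part number (0xd01) from this patch and ignores the variant field, so both
variant 0 and variant 1 take the same path:

#include <stdint.h>

/* Sketch: identify tsv110 from MIDR_EL1, ignoring the variant and
   revision fields so that both variant 0 and variant 1 match.  */
static int
is_tsv110 (void)
{
  uint64_t midr;
  __asm__ volatile ("mrs %0, midr_el1" : "=r" (midr));
  unsigned implementer = (midr >> 24) & 0xff;   /* HiSilicon: 0x48.  */
  unsigned partnum = (midr >> 4) & 0xfff;       /* tsv110:    0xd01.  */
  return implementer == 0x48 && partnum == 0xd01;
}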

Hi ARM guys,
are you happy to share your ideas about this?

> Is this really that important for performance and on what workloads ?
> 

Since DC CVAU is not needed to sync the icache and dcache on tsv110, it is
beneficial for performance to skip the redundant DC CVAU and do only IC IVAU.
For the JVM, __clear_cache is called many times.
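
For context, a rough sketch of the JIT pattern that hits this path (using
GCC's __builtin___clear_cache; allocating the executable buffer is elided):

#include <string.h>

/* Sketch: a JIT copies generated instructions into an already-executable
   buffer, then must make the I-cache coherent with the D-cache before
   jumping to the code.  On AArch64 this is where the DC CVAU / IC IVAU
   sequence under discussion gets run.  */
static void
emit_and_run (void *codebuf, const void *insns, size_t len)
{
  memcpy (codebuf, insns, len);
  __builtin___clear_cache ((char *) codebuf, (char *) codebuf + len);
  ((void (*) (void)) codebuf) ();
}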

Thanks,
Shaokun

> regards
> Ramana
> 
>>
>> Any thoughts and ideas are welcome.
>>
>> Shaokun Zhang (1):
>>   [aarch64] Add HiSilicon tsv110 CPU support.
>>
>>  gcc/ChangeLog                            |   9 +++
>>  gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>  gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>  gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>  gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>  gcc/doc/invoke.texi                      |   2 +-
>>  6 files changed, 198 insertions(+), 2 deletions(-)
>>
>> --
>> 2.7.4
>>
> 
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support.
  2018-05-22 11:23   ` Kyrill Tkachov
@ 2018-05-23  5:34     ` Zhangshaokun
  2018-05-23  8:26       ` Kyrill Tkachov
  0 siblings, 1 reply; 12+ messages in thread
From: Zhangshaokun @ 2018-05-23  5:34 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches
  Cc: Marcus Shawcroft, Richard Earnshaw (lists), James Greenhalgh, felix.yang


Hi Kyrill,

On 2018/5/22 18:52, Kyrill Tkachov wrote:
> Hi Shaokun,
> 
> On 22/05/18 09:40, Shaokun Zhang wrote:
>> This patch adds support for HiSilicon's tsv110 CPU as an -mcpu option.
>>
>> ---
>>  gcc/ChangeLog                            |   9 +++
>>  gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>  gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>  gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>  gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>  gcc/doc/invoke.texi                      |   2 +-
>>  6 files changed, 198 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index cec2892..5d44966 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,12 @@
>> +2018-05-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>> +            Bo Zhou  <zbo.zhou@hisilicon.com>
>> +
>> +       * config/aarch64/aarch64-cores.def (tsv110): New CPU.
>> +       * config/aarch64/aarch64-tune.md: Regenerated.
>> +       * doc/invoke.texi (AArch61 Options/-mtune): Add "tsv110".
> 
> typo: AArch64.
> 

Good catch, my mistake.

>> +       * gcc/config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
>> +       * gcc/config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.
> 
> Please start the path with config/.
> 

Sure, I will remove the gcc/ prefix in the next version.

>> +
>>  2018-05-21  Michael Meissner <meissner@linux.ibm.com>
>>
>>          PR target/85657
>> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
>> index 33b96ca..db7a412 100644
>> --- a/gcc/config/aarch64/aarch64-cores.def
>> +++ b/gcc/config/aarch64/aarch64-cores.def
>> @@ -91,6 +91,11 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2
>>  /* Qualcomm ('Q') cores. */
>>  AARCH64_CORE("saphira",     saphira,    falkor,    8_3A, AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 0xC01, -1)
>>
>> +/* ARMv8.4-A Architecture Processors.  */
>> +
>> +/* HiSilicon ('H') cores. */
>> +AARCH64_CORE("tsv110",     tsv110,    tsv110,    8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)
>> +
> 
> The third field is the scheduler model to use when optimising.
> Since there is no tsv110 scheduling model, using the name "tsv110"
> in the third field will generally give pretty poor schedules.
> I recommend you specify a scheduling model that most closely matches your core
> for the time being. But I don't think it's required and I wouldn't let it hold

I checked it again: cortexa57 most closely matches tsv110. Thanks for your
suggestion.
If I choose cortexa57, can I still add tsv110_tunings, which uses tsv110's
own tuning parameters as in the rest of this patch, or should I only use
the generic tuning?

> up the patch.
> 
> You'll need approval from an aarch64 maintainer (cc'ed some for you).
> 

Good, thanks for your nice guidance.

Thanks,
Shaokun

> Thanks,
> Kyrill
> 
>>  /* ARMv8-A big.LITTLE implementations.  */
>>
>>  AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
>> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
>> index a455c62..b6890d6 100644
>> --- a/gcc/config/aarch64/aarch64-cost-tables.h
>> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
>> @@ -334,4 +334,107 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>>    }
>>  };
>>
>> +const struct cpu_cost_table tsv110_extra_costs =
>> +{
>> +  /* ALU */
>> +  {
>> +    0,                 /* arith.  */
>> +    0,                 /* logical.  */
>> +    0,                 /* shift.  */
>> +    0,                 /* shift_reg.  */
>> +    COSTS_N_INSNS (1), /* arith_shift.  */
>> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
>> +    COSTS_N_INSNS (1), /* log_shift.  */
>> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
>> +    0,                 /* extend.  */
>> +    COSTS_N_INSNS (1), /* extend_arith.  */
>> +    0,                 /* bfi.  */
>> +    0,                 /* bfx.  */
>> +    0,                 /* clz.  */
>> +    0,                /* rev.  */
>> +    0,                 /* non_exec.  */
>> +    true               /* non_exec_costs_exec.  */
>> +  },
>> +  {
>> +    /* MULT SImode */
>> +    {
>> +      COSTS_N_INSNS (2),       /* simple.  */
>> +      COSTS_N_INSNS (2),       /* flag_setting.  */
>> +      COSTS_N_INSNS (2),       /* extend.  */
>> +      COSTS_N_INSNS (2),       /* add.  */
>> +      COSTS_N_INSNS (2),       /* extend_add.  */
>> +      COSTS_N_INSNS (11)       /* idiv.  */
>> +    },
>> +    /* MULT DImode */
>> +    {
>> +      COSTS_N_INSNS (3),       /* simple.  */
>> +      0,                       /* flag_setting (N/A).  */
>> +      COSTS_N_INSNS (3),       /* extend.  */
>> +      COSTS_N_INSNS (3),       /* add.  */
>> +      COSTS_N_INSNS (3),       /* extend_add.  */
>> +      COSTS_N_INSNS (19)       /* idiv.  */
>> +    }
>> +  },
>> +  /* LD/ST */
>> +  {
>> +    COSTS_N_INSNS (3),         /* load.  */
>> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
>> +    COSTS_N_INSNS (3),         /* ldrd.  */
>> +    COSTS_N_INSNS (3),         /* ldm_1st.  */
>> +    1,                         /* ldm_regs_per_insn_1st. */
>> +    2,                         /* ldm_regs_per_insn_subsequent.  */
>> +    COSTS_N_INSNS (4),         /* loadf.  */
>> +    COSTS_N_INSNS (4),         /* loadd.  */
>> +    COSTS_N_INSNS (4),         /* load_unaligned.  */
>> +    0,                         /* store.  */
>> +    0,                         /* strd.  */
>> +    0,                         /* stm_1st.  */
>> +    1,                         /* stm_regs_per_insn_1st. */
>> +    2,                         /* stm_regs_per_insn_subsequent.  */
>> +    0,                         /* storef.  */
>> +    0,                         /* stored.  */
>> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
>> +    COSTS_N_INSNS (4),         /* loadv.  */
>> +    COSTS_N_INSNS (4)          /* storev.  */
>> +  },
>> +  {
>> +    /* FP SFmode */
>> +    {
>> +      COSTS_N_INSNS (10),      /* div.  */
>> +      COSTS_N_INSNS (4),       /* mult.  */
>> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
>> +      COSTS_N_INSNS (4),       /* fma.  */
>> +      COSTS_N_INSNS (4),       /* addsub.  */
>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>> +      COSTS_N_INSNS (1),       /* neg.  */
>> +      COSTS_N_INSNS (1),       /* compare.  */
>> +      COSTS_N_INSNS (2),       /* widen.  */
>> +      COSTS_N_INSNS (2),       /* narrow.  */
>> +      COSTS_N_INSNS (2),       /* toint.  */
>> +      COSTS_N_INSNS (1),       /* fromint.  */
>> +      COSTS_N_INSNS (2)        /* roundint.  */
>> +    },
>> +    /* FP DFmode */
>> +    {
>> +      COSTS_N_INSNS (17),      /* div.  */
>> +      COSTS_N_INSNS (4),       /* mult.  */
>> +      COSTS_N_INSNS (6),       /* mult_addsub.  */
>> +      COSTS_N_INSNS (6),       /* fma.  */
>> +      COSTS_N_INSNS (3),       /* addsub.  */
>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>> +      COSTS_N_INSNS (1),       /* neg.  */
>> +      COSTS_N_INSNS (1),       /* compare.  */
>> +      COSTS_N_INSNS (2),       /* widen.  */
>> +      COSTS_N_INSNS (2),       /* narrow.  */
>> +      COSTS_N_INSNS (2),       /* toint.  */
>> +      COSTS_N_INSNS (1),       /* fromint.  */
>> +      COSTS_N_INSNS (2)        /* roundint.  */
>> +    }
>> +  },
>> +  /* Vector */
>> +  {
>> +    COSTS_N_INSNS (1)  /* alu.  */
>> +  }
>> +};
>> +
>>  #endif
>> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
>> index 7b3a746..a10f2e7 100644
>> --- a/gcc/config/aarch64/aarch64-tune.md
>> +++ b/gcc/config/aarch64/aarch64-tune.md
>> @@ -1,5 +1,5 @@
>>  ;; -*- buffer-read-only: t -*-
>>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>>  (define_attr "tune"
>> - "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>> + "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,tsv110,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>>          (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 6bf6c05..0788c14 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -266,6 +266,22 @@ static const struct cpu_addrcost_table generic_addrcost_table =
>>    0 /* imm_offset  */
>>  };
>>
>> +static const struct cpu_addrcost_table tsv110_addrcost_table =
>> +{
>> +    {
>> +      1, /* hi  */
>> +      0, /* si  */
>> +      0, /* di  */
>> +      1, /* ti  */
>> +    },
>> +  0, /* pre_modify  */
>> +  0, /* post_modify  */
>> +  0, /* register_offset  */
>> +  1, /* register_sextend  */
>> +  1, /* register_zextend  */
>> +  0 /* imm_offset  */
>> +};
>> +
>>  static const struct cpu_addrcost_table exynosm1_addrcost_table =
>>  {
>>      {
>> @@ -344,6 +360,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
>>    2 /* FP2FP  */
>>  };
>>
>> +static const struct cpu_regmove_cost tsv110_regmove_cost =
>> +{
>> +  1, /* GP2GP  */
>> +  /* Avoid the use of slow int<->fp moves for spilling by setting
>> +     their cost higher than memmov_cost.  */
>> +  2, /* GP2FP  */
>> +  3, /* FP2GP  */
>> +  2  /* FP2FP  */
>> +};
>> +
>>  static const struct cpu_regmove_cost exynosm1_regmove_cost =
>>  {
>>    1, /* GP2GP  */
>> @@ -450,6 +476,25 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
>>    1 /* cond_not_taken_branch_cost  */
>>  };
>>
>> +static const struct cpu_vector_cost tsv110_vector_cost =
>> +{
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>> +  5, /* scalar_load_cost  */
>> +  1, /* scalar_store_cost  */
>> +  2, /* vec_int_stmt_cost  */
>> +  2, /* vec_fp_stmt_cost  */
>> +  2, /* vec_permute_cost  */
>> +  3, /* vec_to_scalar_cost  */
>> +  2, /* scalar_to_vec_cost  */
>> +  5, /* vec_align_load_cost  */
>> +  5, /* vec_unalign_load_cost  */
>> +  1, /* vec_unalign_store_cost  */
>> +  1, /* vec_store_cost  */
>> +  1, /* cond_taken_branch_cost  */
>> +  1 /* cond_not_taken_branch_cost  */
>> +};
>> +
>>  static const struct cpu_vector_cost exynosm1_vector_cost =
>>  {
>>    1, /* scalar_int_stmt_cost  */
>> @@ -550,6 +595,15 @@ static const cpu_prefetch_tune generic_prefetch_tune =
>>    -1                   /* default_opt_level  */
>>  };
>>
>> +static const cpu_prefetch_tune tsv110_prefetch_tune =
>> +{
>> +  0,                   /* num_slots  */
>> +  64,                  /* l1_cache_size  */
>> +  64,                  /* l1_cache_line_size  */
>> +  512,                 /* l2_cache_size  */
>> +  -1                   /* default_opt_level  */
>> +};
>> +
>>  static const cpu_prefetch_tune exynosm1_prefetch_tune =
>>  {
>>    0,                   /* num_slots  */
>> @@ -751,6 +805,31 @@ static const struct tune_params cortexa73_tunings =
>>  };
>>
>>
>> +static const struct tune_params tsv110_tunings =
>> +{
>> +  &tsv110_extra_costs,
>> +  &tsv110_addrcost_table,
>> +  &tsv110_regmove_cost,
>> +  &tsv110_vector_cost,
>> +  &generic_branch_cost,
>> +  &generic_approx_modes,
>> +  4, /* memmov_cost  */
>> +  4, /* issue_rate  */
>> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
>> +   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
>> +  16,  /* function_align.  */
>> +  4,   /* jump_align.  */
>> +  8,   /* loop_align.  */
>> +  2,   /* int_reassoc_width.  */
>> +  4,   /* fp_reassoc_width.  */
>> +  1,   /* vec_reassoc_width.  */
>> +  2,   /* min_div_recip_mul_sf.  */
>> +  2,   /* min_div_recip_mul_df.  */
>> +  0,   /* max_case_values.  */
>> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
>> +  &tsv110_prefetch_tune
>> +};
>>
>>  static const struct tune_params exynosm1_tunings =
>>  {
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index beba295..55fcd42 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -14713,7 +14713,7 @@ performance of the code. Permissible values for this option are:
>>  @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
>>  @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
>>  @samp{exynos-m1}, @samp{falkor}, @samp{qdf24xx}, @samp{saphira},
>> -@samp{xgene1}, @samp{vulcan}, @samp{thunderx},
>> +@samp{xgene1}, @samp{vulcan}, @samp{thunderx}, @samp{tsv110},
>>  @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81},
>>  @samp{thunderxt83}, @samp{thunderx2t99}, @samp{cortex-a57.cortex-a53},
>>  @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35},
>> -- 
>> 2.7.4
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support.
  2018-05-23  5:34     ` Zhangshaokun
@ 2018-05-23  8:26       ` Kyrill Tkachov
  2018-05-23  9:05         ` Zhangshaokun
  0 siblings, 1 reply; 12+ messages in thread
From: Kyrill Tkachov @ 2018-05-23  8:26 UTC (permalink / raw)
  To: Zhangshaokun, gcc-patches
  Cc: Marcus Shawcroft, Richard Earnshaw (lists), James Greenhalgh, felix.yang


On 23/05/18 05:54, Zhangshaokun wrote:
> Hi Kyrill,
>
> On 2018/5/22 18:52, Kyrill Tkachov wrote:
>> Hi Shaokun,
>>
>> On 22/05/18 09:40, Shaokun Zhang wrote:
>>> This patch adds support for HiSilicon's tsv110 CPU as an -mcpu option.
>>>
>>> ---
>>>   gcc/ChangeLog                            |   9 +++
>>>   gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>   gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>   gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>   gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>   gcc/doc/invoke.texi                      |   2 +-
>>>   6 files changed, 198 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>> index cec2892..5d44966 100644
>>> --- a/gcc/ChangeLog
>>> +++ b/gcc/ChangeLog
>>> @@ -1,3 +1,12 @@
>>> +2018-05-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>> +            Bo Zhou  <zbo.zhou@hisilicon.com>
>>> +
>>> +       * config/aarch64/aarch64-cores.def (tsv110): New CPU.
>>> +       * config/aarch64/aarch64-tune.md: Regenerated.
>>> +       * doc/invoke.texi (AArch61 Options/-mtune): Add "tsv110".
>> typo: AArch64.
>>
> Good catch, my mistake.
>
>>> +       * gcc/config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
>>> +       * gcc/config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.
>> Please start the path with config/.
>>
> Sure, I will remove the gcc/ prefix in the next version.
>
>>> +
>>>   2018-05-21  Michael Meissner <meissner@linux.ibm.com>
>>>
>>>           PR target/85657
>>> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
>>> index 33b96ca..db7a412 100644
>>> --- a/gcc/config/aarch64/aarch64-cores.def
>>> +++ b/gcc/config/aarch64/aarch64-cores.def
>>> @@ -91,6 +91,11 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2
>>>   /* Qualcomm ('Q') cores. */
>>>   AARCH64_CORE("saphira",     saphira,    falkor,    8_3A, AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 0xC01, -1)
>>>
>>> +/* ARMv8.4-A Architecture Processors.  */
>>> +
>>> +/* HiSilicon ('H') cores. */
>>> +AARCH64_CORE("tsv110",     tsv110,    tsv110,    8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)
>>> +
>> The third field is the scheduler model to use when optimising.
>> Since there is no tsv110 scheduling model, using the name "tsv110"
>> in the third field will generally give pretty poor schedules.
>> I recommend you specify a scheduling model that most closely matches your core
>> for the time being. But I don't think it's required and I wouldn't let it hold
> I checked it again: cortexa57 most closely matches tsv110. Thanks for your
> suggestion.
> If I choose cortexa57, can I still add tsv110_tunings, which uses tsv110's
> own tuning parameters as in the rest of this patch, or should I only use
> the generic tuning?

If you use cortexa57 for the scheduling model (the 3rd field) you should still
use tsv110_tunings in the 6th field as this will specify other important parameters
like instruction selection costs, fusion capabilities, alignment requirements etc.
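
For illustration, the resulting aarch64-cores.def entry would then look
something like the line below (the submitted entry with only the third field
changed to cortexa57; the sixth field still names "tsv110", which selects the
tsv110 extra costs and tsv110_tunings):

AARCH64_CORE("tsv110",  tsv110,  cortexa57,  8_4A,  AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,  0x48, 0xd01, -1)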

Thanks,
Kyrill

>
>> up the patch.
>>
>> You'll need approval from an aarch64 maintainer (cc'ed some for you).
>>
> Good, thanks for your nice guidance.
>
> Thanks,
> Shaokun
>
>> Thanks,
>> Kyrill
>>
>>>   /* ARMv8-A big.LITTLE implementations.  */
>>>
>>>   AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
>>> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
>>> index a455c62..b6890d6 100644
>>> --- a/gcc/config/aarch64/aarch64-cost-tables.h
>>> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
>>> @@ -334,4 +334,107 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>>>     }
>>>   };
>>>
>>> +const struct cpu_cost_table tsv110_extra_costs =
>>> +{
>>> +  /* ALU */
>>> +  {
>>> +    0,                 /* arith.  */
>>> +    0,                 /* logical.  */
>>> +    0,                 /* shift.  */
>>> +    0,                 /* shift_reg.  */
>>> +    COSTS_N_INSNS (1), /* arith_shift.  */
>>> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
>>> +    COSTS_N_INSNS (1), /* log_shift.  */
>>> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
>>> +    0,                 /* extend.  */
>>> +    COSTS_N_INSNS (1), /* extend_arith.  */
>>> +    0,                 /* bfi.  */
>>> +    0,                 /* bfx.  */
>>> +    0,                 /* clz.  */
>>> +    0,                /* rev.  */
>>> +    0,                 /* non_exec.  */
>>> +    true               /* non_exec_costs_exec.  */
>>> +  },
>>> +  {
>>> +    /* MULT SImode */
>>> +    {
>>> +      COSTS_N_INSNS (2),       /* simple.  */
>>> +      COSTS_N_INSNS (2),       /* flag_setting.  */
>>> +      COSTS_N_INSNS (2),       /* extend.  */
>>> +      COSTS_N_INSNS (2),       /* add.  */
>>> +      COSTS_N_INSNS (2),       /* extend_add.  */
>>> +      COSTS_N_INSNS (11)       /* idiv.  */
>>> +    },
>>> +    /* MULT DImode */
>>> +    {
>>> +      COSTS_N_INSNS (3),       /* simple.  */
>>> +      0,                       /* flag_setting (N/A).  */
>>> +      COSTS_N_INSNS (3),       /* extend.  */
>>> +      COSTS_N_INSNS (3),       /* add.  */
>>> +      COSTS_N_INSNS (3),       /* extend_add.  */
>>> +      COSTS_N_INSNS (19)       /* idiv.  */
>>> +    }
>>> +  },
>>> +  /* LD/ST */
>>> +  {
>>> +    COSTS_N_INSNS (3),         /* load.  */
>>> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
>>> +    COSTS_N_INSNS (3),         /* ldrd.  */
>>> +    COSTS_N_INSNS (3),         /* ldm_1st.  */
>>> +    1,                         /* ldm_regs_per_insn_1st. */
>>> +    2,                         /* ldm_regs_per_insn_subsequent.  */
>>> +    COSTS_N_INSNS (4),         /* loadf.  */
>>> +    COSTS_N_INSNS (4),         /* loadd.  */
>>> +    COSTS_N_INSNS (4),         /* load_unaligned.  */
>>> +    0,                         /* store.  */
>>> +    0,                         /* strd.  */
>>> +    0,                         /* stm_1st.  */
>>> +    1,                         /* stm_regs_per_insn_1st. */
>>> +    2,                         /* stm_regs_per_insn_subsequent.  */
>>> +    0,                         /* storef.  */
>>> +    0,                         /* stored.  */
>>> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
>>> +    COSTS_N_INSNS (4),         /* loadv.  */
>>> +    COSTS_N_INSNS (4)          /* storev.  */
>>> +  },
>>> +  {
>>> +    /* FP SFmode */
>>> +    {
>>> +      COSTS_N_INSNS (10),      /* div.  */
>>> +      COSTS_N_INSNS (4),       /* mult.  */
>>> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
>>> +      COSTS_N_INSNS (4),       /* fma.  */
>>> +      COSTS_N_INSNS (4),       /* addsub.  */
>>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>>> +      COSTS_N_INSNS (1),       /* neg.  */
>>> +      COSTS_N_INSNS (1),       /* compare.  */
>>> +      COSTS_N_INSNS (2),       /* widen.  */
>>> +      COSTS_N_INSNS (2),       /* narrow.  */
>>> +      COSTS_N_INSNS (2),       /* toint.  */
>>> +      COSTS_N_INSNS (1),       /* fromint.  */
>>> +      COSTS_N_INSNS (2)        /* roundint.  */
>>> +    },
>>> +    /* FP DFmode */
>>> +    {
>>> +      COSTS_N_INSNS (17),      /* div.  */
>>> +      COSTS_N_INSNS (4),       /* mult.  */
>>> +      COSTS_N_INSNS (6),       /* mult_addsub.  */
>>> +      COSTS_N_INSNS (6),       /* fma.  */
>>> +      COSTS_N_INSNS (3),       /* addsub.  */
>>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>>> +      COSTS_N_INSNS (1),       /* neg.  */
>>> +      COSTS_N_INSNS (1),       /* compare.  */
>>> +      COSTS_N_INSNS (2),       /* widen.  */
>>> +      COSTS_N_INSNS (2),       /* narrow.  */
>>> +      COSTS_N_INSNS (2),       /* toint.  */
>>> +      COSTS_N_INSNS (1),       /* fromint.  */
>>> +      COSTS_N_INSNS (2)        /* roundint.  */
>>> +    }
>>> +  },
>>> +  /* Vector */
>>> +  {
>>> +    COSTS_N_INSNS (1)  /* alu.  */
>>> +  }
>>> +};
>>> +
>>>   #endif
>>> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
>>> index 7b3a746..a10f2e7 100644
>>> --- a/gcc/config/aarch64/aarch64-tune.md
>>> +++ b/gcc/config/aarch64/aarch64-tune.md
>>> @@ -1,5 +1,5 @@
>>>   ;; -*- buffer-read-only: t -*-
>>>   ;; Generated automatically by gentune.sh from aarch64-cores.def
>>>   (define_attr "tune"
>>> - "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>>> + "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,tsv110,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>>>           (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>>> index 6bf6c05..0788c14 100644
>>> --- a/gcc/config/aarch64/aarch64.c
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -266,6 +266,22 @@ static const struct cpu_addrcost_table generic_addrcost_table =
>>>     0 /* imm_offset  */
>>>   };
>>>
>>> +static const struct cpu_addrcost_table tsv110_addrcost_table =
>>> +{
>>> +    {
>>> +      1, /* hi  */
>>> +      0, /* si  */
>>> +      0, /* di  */
>>> +      1, /* ti  */
>>> +    },
>>> +  0, /* pre_modify  */
>>> +  0, /* post_modify  */
>>> +  0, /* register_offset  */
>>> +  1, /* register_sextend  */
>>> +  1, /* register_zextend  */
>>> +  0 /* imm_offset  */
>>> +};
>>> +
>>>   static const struct cpu_addrcost_table exynosm1_addrcost_table =
>>>   {
>>>       {
>>> @@ -344,6 +360,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
>>>     2 /* FP2FP  */
>>>   };
>>>
>>> +static const struct cpu_regmove_cost tsv110_regmove_cost =
>>> +{
>>> +  1, /* GP2GP  */
>>> +  /* Avoid the use of slow int<->fp moves for spilling by setting
>>> +     their cost higher than memmov_cost.  */
>>> +  2, /* GP2FP  */
>>> +  3, /* FP2GP  */
>>> +  2  /* FP2FP  */
>>> +};
>>> +
>>>   static const struct cpu_regmove_cost exynosm1_regmove_cost =
>>>   {
>>>     1, /* GP2GP  */
>>> @@ -450,6 +476,25 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
>>>     1 /* cond_not_taken_branch_cost  */
>>>   };
>>>
>>> +static const struct cpu_vector_cost tsv110_vector_cost =
>>> +{
>>> +  1, /* scalar_int_stmt_cost  */
>>> +  1, /* scalar_fp_stmt_cost  */
>>> +  5, /* scalar_load_cost  */
>>> +  1, /* scalar_store_cost  */
>>> +  2, /* vec_int_stmt_cost  */
>>> +  2, /* vec_fp_stmt_cost  */
>>> +  2, /* vec_permute_cost  */
>>> +  3, /* vec_to_scalar_cost  */
>>> +  2, /* scalar_to_vec_cost  */
>>> +  5, /* vec_align_load_cost  */
>>> +  5, /* vec_unalign_load_cost  */
>>> +  1, /* vec_unalign_store_cost  */
>>> +  1, /* vec_store_cost  */
>>> +  1, /* cond_taken_branch_cost  */
>>> +  1 /* cond_not_taken_branch_cost  */
>>> +};
>>> +
>>>   static const struct cpu_vector_cost exynosm1_vector_cost =
>>>   {
>>>     1, /* scalar_int_stmt_cost  */
>>> @@ -550,6 +595,15 @@ static const cpu_prefetch_tune generic_prefetch_tune =
>>>     -1                   /* default_opt_level  */
>>>   };
>>>
>>> +static const cpu_prefetch_tune tsv110_prefetch_tune =
>>> +{
>>> +  0,                   /* num_slots  */
>>> +  64,                  /* l1_cache_size  */
>>> +  64,                  /* l1_cache_line_size  */
>>> +  512,                 /* l2_cache_size  */
>>> +  -1                   /* default_opt_level  */
>>> +};
>>> +
>>>   static const cpu_prefetch_tune exynosm1_prefetch_tune =
>>>   {
>>>     0,                   /* num_slots  */
>>> @@ -751,6 +805,31 @@ static const struct tune_params cortexa73_tunings =
>>>   };
>>>
>>>
>>> +static const struct tune_params tsv110_tunings =
>>> +{
>>> +  &tsv110_extra_costs,
>>> +  &tsv110_addrcost_table,
>>> +  &tsv110_regmove_cost,
>>> +  &tsv110_vector_cost,
>>> +  &generic_branch_cost,
>>> +  &generic_approx_modes,
>>> +  4, /* memmov_cost  */
>>> +  4, /* issue_rate  */
>>> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
>>> +   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
>>> +  16,  /* function_align.  */
>>> +  4,   /* jump_align.  */
>>> +  8,   /* loop_align.  */
>>> +  2,   /* int_reassoc_width.  */
>>> +  4,   /* fp_reassoc_width.  */
>>> +  1,   /* vec_reassoc_width.  */
>>> +  2,   /* min_div_recip_mul_sf.  */
>>> +  2,   /* min_div_recip_mul_df.  */
>>> +  0,   /* max_case_values.  */
>>> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>>> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
>>> +  &tsv110_prefetch_tune
>>> +};
>>>
>>>   static const struct tune_params exynosm1_tunings =
>>>   {
>>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>>> index beba295..55fcd42 100644
>>> --- a/gcc/doc/invoke.texi
>>> +++ b/gcc/doc/invoke.texi
>>> @@ -14713,7 +14713,7 @@ performance of the code. Permissible values for this option are:
>>>   @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
>>>   @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
>>>   @samp{exynos-m1}, @samp{falkor}, @samp{qdf24xx}, @samp{saphira},
>>> -@samp{xgene1}, @samp{vulcan}, @samp{thunderx},
>>> +@samp{xgene1}, @samp{vulcan}, @samp{thunderx}, @samp{tsv110},
>>>   @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81},
>>>   @samp{thunderxt83}, @samp{thunderx2t99}, @samp{cortex-a57.cortex-a53},
>>>   @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35},
>>> -- 
>>> 2.7.4
>>>
>>
>> .
>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support.
  2018-05-23  8:26       ` Kyrill Tkachov
@ 2018-05-23  9:05         ` Zhangshaokun
  0 siblings, 0 replies; 12+ messages in thread
From: Zhangshaokun @ 2018-05-23  9:05 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches
  Cc: Marcus Shawcroft, Richard Earnshaw (lists), James Greenhalgh, felix.yang

Hi Kyrill,

On 2018/5/23 16:08, Kyrill Tkachov wrote:
> 
> On 23/05/18 05:54, Zhangshaokun wrote:
>> Hi Kyrill,
>>
>> On 2018/5/22 18:52, Kyrill Tkachov wrote:
>>> Hi Shaokun,
>>>
>>> On 22/05/18 09:40, Shaokun Zhang wrote:
>>>> This patch adds HiSilicon's an mcpu: tsv110.
>>>>
>>>> ---
>>>>   gcc/ChangeLog                            |   9 +++
>>>>   gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>>   gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>>   gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>>   gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>>   gcc/doc/invoke.texi                      |   2 +-
>>>>   6 files changed, 198 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>>> index cec2892..5d44966 100644
>>>> --- a/gcc/ChangeLog
>>>> +++ b/gcc/ChangeLog
>>>> @@ -1,3 +1,12 @@
>>>> +2018-05-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>>> +            Bo Zhou  <zbo.zhou@hisilicon.com>
>>>> +
>>>> +       * config/aarch64/aarch64-cores.def (tsv110): New CPU.
>>>> +       * config/aarch64/aarch64-tune.md: Regenerated.
>>>> +       * doc/invoke.texi (AArch61 Options/-mtune): Add "tsv110".
>>> typo: AArch64.
>>>
>> Good catch, my mistake.
>>
>>>> +       * gcc/config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
>>>> +       * gcc/config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.
>>> Please start the path with config/.
>>>
>> Sure, I will drop the gcc/ prefix in the next version.
>>
>>>> +
>>>>   2018-05-21  Michael Meissner <meissner@linux.ibm.com>
>>>>
>>>>           PR target/85657
>>>> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
>>>> index 33b96ca..db7a412 100644
>>>> --- a/gcc/config/aarch64/aarch64-cores.def
>>>> +++ b/gcc/config/aarch64/aarch64-cores.def
>>>> @@ -91,6 +91,11 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2
>>>>   /* Qualcomm ('Q') cores. */
>>>>   AARCH64_CORE("saphira",     saphira,    falkor,    8_3A, AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 0xC01, -1)
>>>>
>>>> +/* ARMv8.4-A Architecture Processors.  */
>>>> +
>>>> +/* HiSilicon ('H') cores. */
>>>> +AARCH64_CORE("tsv110",     tsv110,    tsv110,    8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)
>>>> +
>>> The third field is the scheduler model to use when optimising.
>>> Since there is no tsv110 scheduling model, using the name "tsv110"
>>> in the third field will generally give pretty poor schedules.
>>> I recommend you specify a scheduling model that most closely matches your core
>>> for the time being. But I don't think it's required and I wouldn't let it hold
>> I checked again: cortexa57 matches tsv110 most closely, thanks for the
>> suggestion.
>> If I choose cortexa57 there, can I still add tsv110_tunings so that tsv110's
>> pipeline features are used, as in the rest of the patch, or should I only
>> use the generic tuning?
> 
> If you use cortexa57 for the scheduling model (the 3rd field) you should still
> use tsv110_tunings in the 6th field as this will specify other important parameters
> like instruction selection costs, fusion capabilities, alignment requirements etc.
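> I.e. the entry would then look roughly like this (illustrative only,
> reusing the feature flags from the hunk above; only the 3rd field changes):
> 
> AARCH64_CORE("tsv110",     tsv110,    cortexa57,    8_4A,  AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110,   0x48, 0xd01, -1)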
> 

Thanks for your comments; I will wait for the other maintainers' comments and prepare the next version.
One more question: any thoughts on the issue from my cover letter, i.e. skipping DC CVAU for
HiSilicon tsv110 when syncing the icache and dcache?

Thanks,
Shaokun

> Thanks,
> Kyrill
> 
>>
>>> up the patch.
>>>
>>> You'll need approval from an aarch64 maintainer (cc'ed some for you).
>>>
>> Good, thanks for your nice guidance.
>>
>> Thanks,
>> Shaokun
>>
>>> Thanks,
>>> Kyrill
>>>
>>>>   /* ARMv8-A big.LITTLE implementations.  */
>>>>
>>>>   AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1)
>>>> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
>>>> index a455c62..b6890d6 100644
>>>> --- a/gcc/config/aarch64/aarch64-cost-tables.h
>>>> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
>>>> @@ -334,4 +334,107 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>>>>     }
>>>>   };
>>>>
>>>> +const struct cpu_cost_table tsv110_extra_costs =
>>>> +{
>>>> +  /* ALU */
>>>> +  {
>>>> +    0,                 /* arith.  */
>>>> +    0,                 /* logical.  */
>>>> +    0,                 /* shift.  */
>>>> +    0,                 /* shift_reg.  */
>>>> +    COSTS_N_INSNS (1), /* arith_shift.  */
>>>> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
>>>> +    COSTS_N_INSNS (1), /* log_shift.  */
>>>> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
>>>> +    0,                 /* extend.  */
>>>> +    COSTS_N_INSNS (1), /* extend_arith.  */
>>>> +    0,                 /* bfi.  */
>>>> +    0,                 /* bfx.  */
>>>> +    0,                 /* clz.  */
>>>> +    0,                /* rev.  */
>>>> +    0,                 /* non_exec.  */
>>>> +    true               /* non_exec_costs_exec.  */
>>>> +  },
>>>> +  {
>>>> +    /* MULT SImode */
>>>> +    {
>>>> +      COSTS_N_INSNS (2),       /* simple.  */
>>>> +      COSTS_N_INSNS (2),       /* flag_setting.  */
>>>> +      COSTS_N_INSNS (2),       /* extend.  */
>>>> +      COSTS_N_INSNS (2),       /* add.  */
>>>> +      COSTS_N_INSNS (2),       /* extend_add.  */
>>>> +      COSTS_N_INSNS (11)       /* idiv.  */
>>>> +    },
>>>> +    /* MULT DImode */
>>>> +    {
>>>> +      COSTS_N_INSNS (3),       /* simple.  */
>>>> +      0,                       /* flag_setting (N/A).  */
>>>> +      COSTS_N_INSNS (3),       /* extend.  */
>>>> +      COSTS_N_INSNS (3),       /* add.  */
>>>> +      COSTS_N_INSNS (3),       /* extend_add.  */
>>>> +      COSTS_N_INSNS (19)       /* idiv.  */
>>>> +    }
>>>> +  },
>>>> +  /* LD/ST */
>>>> +  {
>>>> +    COSTS_N_INSNS (3),         /* load.  */
>>>> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
>>>> +    COSTS_N_INSNS (3),         /* ldrd.  */
>>>> +    COSTS_N_INSNS (3),         /* ldm_1st.  */
>>>> +    1,                         /* ldm_regs_per_insn_1st. */
>>>> +    2,                         /* ldm_regs_per_insn_subsequent.  */
>>>> +    COSTS_N_INSNS (4),         /* loadf.  */
>>>> +    COSTS_N_INSNS (4),         /* loadd.  */
>>>> +    COSTS_N_INSNS (4),         /* load_unaligned.  */
>>>> +    0,                         /* store.  */
>>>> +    0,                         /* strd.  */
>>>> +    0,                         /* stm_1st.  */
>>>> +    1,                         /* stm_regs_per_insn_1st. */
>>>> +    2,                         /* stm_regs_per_insn_subsequent.  */
>>>> +    0,                         /* storef.  */
>>>> +    0,                         /* stored.  */
>>>> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
>>>> +    COSTS_N_INSNS (4),         /* loadv.  */
>>>> +    COSTS_N_INSNS (4)          /* storev.  */
>>>> +  },
>>>> +  {
>>>> +    /* FP SFmode */
>>>> +    {
>>>> +      COSTS_N_INSNS (10),      /* div.  */
>>>> +      COSTS_N_INSNS (4),       /* mult.  */
>>>> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
>>>> +      COSTS_N_INSNS (4),       /* fma.  */
>>>> +      COSTS_N_INSNS (4),       /* addsub.  */
>>>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>>>> +      COSTS_N_INSNS (1),       /* neg.  */
>>>> +      COSTS_N_INSNS (1),       /* compare.  */
>>>> +      COSTS_N_INSNS (2),       /* widen.  */
>>>> +      COSTS_N_INSNS (2),       /* narrow.  */
>>>> +      COSTS_N_INSNS (2),       /* toint.  */
>>>> +      COSTS_N_INSNS (1),       /* fromint.  */
>>>> +      COSTS_N_INSNS (2)        /* roundint.  */
>>>> +    },
>>>> +    /* FP DFmode */
>>>> +    {
>>>> +      COSTS_N_INSNS (17),      /* div.  */
>>>> +      COSTS_N_INSNS (4),       /* mult.  */
>>>> +      COSTS_N_INSNS (6),       /* mult_addsub.  */
>>>> +      COSTS_N_INSNS (6),       /* fma.  */
>>>> +      COSTS_N_INSNS (3),       /* addsub.  */
>>>> +      COSTS_N_INSNS (1),       /* fpconst.  */
>>>> +      COSTS_N_INSNS (1),       /* neg.  */
>>>> +      COSTS_N_INSNS (1),       /* compare.  */
>>>> +      COSTS_N_INSNS (2),       /* widen.  */
>>>> +      COSTS_N_INSNS (2),       /* narrow.  */
>>>> +      COSTS_N_INSNS (2),       /* toint.  */
>>>> +      COSTS_N_INSNS (1),       /* fromint.  */
>>>> +      COSTS_N_INSNS (2)        /* roundint.  */
>>>> +    }
>>>> +  },
>>>> +  /* Vector */
>>>> +  {
>>>> +    COSTS_N_INSNS (1)  /* alu.  */
>>>> +  }
>>>> +};
>>>> +
>>>>   #endif
>>>> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
>>>> index 7b3a746..a10f2e7 100644
>>>> --- a/gcc/config/aarch64/aarch64-tune.md
>>>> +++ b/gcc/config/aarch64/aarch64-tune.md
>>>> @@ -1,5 +1,5 @@
>>>>   ;; -*- buffer-read-only: t -*-
>>>>   ;; Generated automatically by gentune.sh from aarch64-cores.def
>>>>   (define_attr "tune"
>>>> - "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>>>> + "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,thunderxt81,thunderxt83,xgene1,falkor,qdf24xx,exynosm1,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,saphira,tsv110,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55"
>>>>           (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
>>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>>>> index 6bf6c05..0788c14 100644
>>>> --- a/gcc/config/aarch64/aarch64.c
>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>> @@ -266,6 +266,22 @@ static const struct cpu_addrcost_table generic_addrcost_table =
>>>>     0 /* imm_offset  */
>>>>   };
>>>>
>>>> +static const struct cpu_addrcost_table tsv110_addrcost_table =
>>>> +{
>>>> +    {
>>>> +      1, /* hi  */
>>>> +      0, /* si  */
>>>> +      0, /* di  */
>>>> +      1, /* ti  */
>>>> +    },
>>>> +  0, /* pre_modify  */
>>>> +  0, /* post_modify  */
>>>> +  0, /* register_offset  */
>>>> +  1, /* register_sextend  */
>>>> +  1, /* register_zextend  */
>>>> +  0 /* imm_offset  */
>>>> +};
>>>> +
>>>>   static const struct cpu_addrcost_table exynosm1_addrcost_table =
>>>>   {
>>>>       {
>>>> @@ -344,6 +360,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
>>>>     2 /* FP2FP  */
>>>>   };
>>>>
>>>> +static const struct cpu_regmove_cost tsv110_regmove_cost =
>>>> +{
>>>> +  1, /* GP2GP  */
>>>> +  /* Avoid the use of slow int<->fp moves for spilling by setting
>>>> +     their cost higher than memmov_cost.  */
>>>> +  2, /* GP2FP  */
>>>> +  3, /* FP2GP  */
>>>> +  2  /* FP2FP  */
>>>> +};
>>>> +
>>>>   static const struct cpu_regmove_cost exynosm1_regmove_cost =
>>>>   {
>>>>     1, /* GP2GP  */
>>>> @@ -450,6 +476,25 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
>>>>     1 /* cond_not_taken_branch_cost  */
>>>>   };
>>>>
>>>> +static const struct cpu_vector_cost tsv110_vector_cost =
>>>> +{
>>>> +  1, /* scalar_int_stmt_cost  */
>>>> +  1, /* scalar_fp_stmt_cost  */
>>>> +  5, /* scalar_load_cost  */
>>>> +  1, /* scalar_store_cost  */
>>>> +  2, /* vec_int_stmt_cost  */
>>>> +  2, /* vec_fp_stmt_cost  */
>>>> +  2, /* vec_permute_cost  */
>>>> +  3, /* vec_to_scalar_cost  */
>>>> +  2, /* scalar_to_vec_cost  */
>>>> +  5, /* vec_align_load_cost  */
>>>> +  5, /* vec_unalign_load_cost  */
>>>> +  1, /* vec_unalign_store_cost  */
>>>> +  1, /* vec_store_cost  */
>>>> +  1, /* cond_taken_branch_cost  */
>>>> +  1 /* cond_not_taken_branch_cost  */
>>>> +};
>>>> +
>>>>   static const struct cpu_vector_cost exynosm1_vector_cost =
>>>>   {
>>>>     1, /* scalar_int_stmt_cost  */
>>>> @@ -550,6 +595,15 @@ static const cpu_prefetch_tune generic_prefetch_tune =
>>>>     -1                   /* default_opt_level  */
>>>>   };
>>>>
>>>> +static const cpu_prefetch_tune tsv110_prefetch_tune =
>>>> +{
>>>> +  0,                   /* num_slots  */
>>>> +  64,                  /* l1_cache_size  */
>>>> +  64,                  /* l1_cache_line_size  */
>>>> +  512,                 /* l2_cache_size  */
>>>> +  -1                   /* default_opt_level  */
>>>> +};
>>>> +
>>>>   static const cpu_prefetch_tune exynosm1_prefetch_tune =
>>>>   {
>>>>     0,                   /* num_slots  */
>>>> @@ -751,6 +805,31 @@ static const struct tune_params cortexa73_tunings =
>>>>   };
>>>>
>>>>
>>>> +static const struct tune_params tsv110_tunings =
>>>> +{
>>>> +  &tsv110_extra_costs,
>>>> +  &tsv110_addrcost_table,
>>>> +  &tsv110_regmove_cost,
>>>> +  &tsv110_vector_cost,
>>>> +  &generic_branch_cost,
>>>> +  &generic_approx_modes,
>>>> +  4, /* memmov_cost  */
>>>> +  4, /* issue_rate  */
>>>> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
>>>> +   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
>>>> +  16,  /* function_align.  */
>>>> +  4,   /* jump_align.  */
>>>> +  8,   /* loop_align.  */
>>>> +  2,   /* int_reassoc_width.  */
>>>> +  4,   /* fp_reassoc_width.  */
>>>> +  1,   /* vec_reassoc_width.  */
>>>> +  2,   /* min_div_recip_mul_sf.  */
>>>> +  2,   /* min_div_recip_mul_df.  */
>>>> +  0,   /* max_case_values.  */
>>>> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>>>> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
>>>> +  &tsv110_prefetch_tune
>>>> +};
>>>>
>>>>   static const struct tune_params exynosm1_tunings =
>>>>   {
>>>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>>>> index beba295..55fcd42 100644
>>>> --- a/gcc/doc/invoke.texi
>>>> +++ b/gcc/doc/invoke.texi
>>>> @@ -14713,7 +14713,7 @@ performance of the code. Permissible values for this option are:
>>>>   @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
>>>>   @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
>>>>   @samp{exynos-m1}, @samp{falkor}, @samp{qdf24xx}, @samp{saphira},
>>>> -@samp{xgene1}, @samp{vulcan}, @samp{thunderx},
>>>> +@samp{xgene1}, @samp{vulcan}, @samp{thunderx}, @samp{tsv110},
>>>>   @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81},
>>>>   @samp{thunderxt83}, @samp{thunderx2t99}, @samp{cortex-a57.cortex-a53},
>>>>   @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35},
>>>> -- 
>>>> 2.7.4
>>>>
>>>
>>> .
>>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-05-23  3:51   ` Zhangshaokun
@ 2018-05-23 10:52     ` Ramana Radhakrishnan
  2018-06-01  9:56       ` Zhangshaokun
  0 siblings, 1 reply; 12+ messages in thread
From: Ramana Radhakrishnan @ 2018-05-23 10:52 UTC (permalink / raw)
  To: Zhangshaokun
  Cc: gcc-patches, Marcus Shawcroft, Richard Earnshaw (lists),
	James Greenhalgh, Kyrill Tkachov, felix.yang



On 23/05/2018 03:50, Zhangshaokun wrote:
> Hi Ramana,
> 
> On 2018/5/22 18:28, Ramana Radhakrishnan wrote:
>> On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
>> <zhangshaokun@hisilicon.com> wrote:
>>> tsv110 is designed by HiSilicon and supports v8_4A, it also optimizes
>>> L1 Icache which can access L1 Dcache.
>>> Therefore, DC CVAU is not necessary in __aarch64_sync_cache_range for
>>> tsv110, is there any good idea to skip DC CVAU operation for tsv110.
>>
>> A solution would be to use an ifunc but on a cpu variant.
>>
> 
> ifunc, can you give further explanation?
> If on a cpu variant, for HiSilicon tsv110, we have two version and CPU variants
> are 0 and 1. Both are expected to skip DC CVAU operation in sync icache and
> dcache.

>> Since it is not necessary for sync icache and dcache, it is beneficial for
>> performance to skip the redundant DC CVAU and do IC IVAU only.
>> For JVM, __clear_cache is called many times.
>> 

Thanks for some more detail as to where you think you want to use this. 
Have you investigated whether the jvm can actually elide such a call 
rather than trying to fix this in the toolchain?
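
For concreteness, the call being discussed is the usual JIT publish step,
something along the lines of the sketch below (a hand-written illustration,
not code taken from any particular JVM):

typedef int (*jit_fn) (int);

/* Copy freshly generated machine code into an executable buffer and
   make it visible to the instruction stream before jumping to it.  */
jit_fn
install_code (void *exec_buf, const unsigned char *code, __SIZE_TYPE__ len)
{
  __builtin_memcpy (exec_buf, code, len);
  /* With no clear_cache pattern defined, this expands to a libgcc call
     (__clear_cache -> __aarch64_sync_cache_range) on aarch64 Linux.  */
  __builtin___clear_cache ((char *) exec_buf, (char *) exec_buf + len);
  return (jit_fn) exec_buf;
}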

If you really need to think about solutions in the toolchain -

The simplest first step would be to implement the changes hinted at by 
the comment in aarch64.h .

  If you read the comment above CLEAR_INSN_CACHE in aarch64.h you would 
see that

/* This definition should be relocated to aarch64-elf-raw.h.  This macro
    should be undefined in aarch64-linux.h and a clear_cache pattern
    implmented to emit either the call to __aarch64_sync_cache_range()
    directly or preferably the appropriate sycall or cache clear
    instructions inline.  */
#define CLEAR_INSN_CACHE(beg, end)                              \
   extern void  __aarch64_sync_cache_range (void *, void *);     \
   __aarch64_sync_cache_range (beg, end)

Thus I would expect that implementing the clear_cache pattern, and 
deciding whether or not to emit the call to __aarch64_sync_cache_range 
depending on whether tsv110 was chosen on the command line, would give 
you an idea of what the performance gain actually is by compiling the 
jvm with -mcpu=tsv110 vs -march=armv8-a. You probably also want to 
clean up the trampoline_init code while you are here.

I do think that's something that should be easy enough to do and the 
subject of a patch series in its own right. If your users can rebuild 
the world for tsv110 then this is sufficient.

If you want to have a single jvm binary without any run time checks, 
then you need to investigate the use of ifuncs which are a mechanism in 
the GNU toolchain for this kind of thing. We tend not to use ifuncs 
on a per-CPU basis unless there is a very good reason and the 
performance improvement is worth it (but probably more on a per 
architecture or per architectural basis) and you will need to make the 
case for it including what sort of performance benefits it gives. Some 
introduction about this feature can be found here. 
https://sourceware.org/glibc/wiki/GNU_IFUNC
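
For reference, the shape of such an ifunc is roughly as follows; the
function names and the resolver policy (keying off CTR_EL0.IDC, bit 28)
are made up purely for illustration:

static void
sync_range_full (const void *base, const void *end)
{
  /* DC CVAU loop + IC IVAU loop, as __aarch64_sync_cache_range does today.  */
  (void) base; (void) end;
}

static void
sync_range_no_dc (const void *base, const void *end)
{
  /* IC IVAU loop only, for cores that do not need the DC CVAU step.  */
  (void) base; (void) end;
}

/* The resolver runs once, when the symbol is first bound.  */
static void (*resolve_sync_range (void)) (const void *, const void *)
{
  unsigned long ctr;
  __asm__ volatile ("mrs %0, ctr_el0" : "=r" (ctr));
  return (ctr & (1UL << 28)) ? sync_range_no_dc : sync_range_full;
}

void my_sync_cache_range (const void *base, const void *end)
     __attribute__ ((ifunc ("resolve_sync_range")));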

regards
Ramana

> 
> Hi ARM guys,
> are you happy to share yours idea about this?
> 
>> Is this really that important for performance and on what workloads ?
>>
> 
> Since it is not necessary for sync icache and dcache, it is beneficial for
> performance to skip the redundant DC CVAU and do IC IVAU only.
> For JVM, __clear_cache is called many times.
> 
> Thanks,
> Shaokun
> 
>> regards
>> Ramana
>>
>>>
>>> Any thoughts and ideas are welcome.
>>>
>>> Shaokun Zhang (1):
>>>    [aarch64] Add HiSilicon tsv110 CPU support.
>>>
>>>   gcc/ChangeLog                            |   9 +++
>>>   gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>   gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>   gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>   gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>   gcc/doc/invoke.texi                      |   2 +-
>>>   6 files changed, 198 insertions(+), 2 deletions(-)
>>>
>>> --
>>> 2.7.4
>>>
>>
>>
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-05-23 10:52     ` Ramana Radhakrishnan
@ 2018-06-01  9:56       ` Zhangshaokun
  2018-06-06 14:51         ` Kyrill Tkachov
  0 siblings, 1 reply; 12+ messages in thread
From: Zhangshaokun @ 2018-06-01  9:56 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: gcc-patches, Marcus Shawcroft, Richard Earnshaw (lists),
	James Greenhalgh, Kyrill Tkachov, felix.yang

Hi Ramana,

Sorry for the late reply; I was on a short leave.

On 2018/5/23 18:41, Ramana Radhakrishnan wrote:
> 
> 
> On 23/05/2018 03:50, Zhangshaokun wrote:
>> Hi Ramana,
>>
>> On 2018/5/22 18:28, Ramana Radhakrishnan wrote:
>>> On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
>>> <zhangshaokun@hisilicon.com> wrote:
>>>> tsv110 is designed by HiSilicon and supports v8_4A, it also optimizes
>>>> L1 Icache which can access L1 Dcache.
>>>> Therefore, DC CVAU is not necessary in __aarch64_sync_cache_range for
>>>> tsv110, is there any good idea to skip DC CVAU operation for tsv110.
>>>
>>> A solution would be to use an ifunc but on a cpu variant.
>>>
>>
>> ifunc, can you give further explanation?
>> If on a cpu variant, for HiSilicon tsv110, we have two version and CPU variants
>> are 0 and 1. Both are expected to skip DC CVAU operation in sync icache and
>> dcache.
> 
>>> Since it is not necessary for sync icache and dcache, it is beneficial for
>>> performance to skip the redundant DC CVAU and do IC IVAU only.
>>> For JVM, __clear_cache is called many times.
>>>
> 
> Thanks for some more detail as to where you think you want to use this. Have you investigated whether the jvm can actually elide such a call rather than trying to fix this in the toolchain ?
> 

In fact, we (HiSilicon) want to optimize away the DC CVAU not only in the GNU toolchain, but also in LLVM and elsewhere.

> If you really need to think about solutions in the toolchain -
> 
> The simplest first step would be to implement the changes hinted at by the comment in aarch64.h .
> 
>  If you read the comment above CLEAR_INSN_CACHE in aarch64.h you would see that
> 
> /* This definition should be relocated to aarch64-elf-raw.h.  This macro
>    should be undefined in aarch64-linux.h and a clear_cache pattern
>    implmented to emit either the call to __aarch64_sync_cache_range()
>    directly or preferably the appropriate sycall or cache clear
>    instructions inline.  */
> #define CLEAR_INSN_CACHE(beg, end)                              \
>   extern void  __aarch64_sync_cache_range (void *, void *);     \
>   __aarch64_sync_cache_range (beg, end)
> 
> Thus I would expect that by implementing the clear_cache pattern and deciding whether to put out the call to the __aarch64_sync_cache_range function or not depending on whether you had the tsv110 chosen on the command line would allow you to have an idea of what the performance gain actually is by compiling the jvm with -mcpu=tsv110 vs -march=armv8-a. You probably also want to clean up the trampoline_init code while you are here.
> 

Thanks for the detailed explanation and guidance.
For our next-generation CPU core, tsv200, we will also optimize IC IVAU so that there is no need to
flush the Icache at all, keeping __clear_cache as a no-op. Should I take that into account here, or
have I missed something in what you said?

Thanks,
Shaokun

> I do think that's something that should be easy enough to do and the subject of a patch series in its own right. If your users can rebuild the world for tsv110 then this is sufficient.
> 
> If you want to have a single jvm binary without any run time checks, then you need to investigate the use of ifuncs which are a mechanism in the GNU toolchain for some of this kind of stuff. We tend not to ifuncs on a per CPU basis unless there is a very good reason and the performance improvement is worth it (but probably more on a per architecture or per architectural basis) and you will need to make the case for it including what sort of performance benefits it gives. Some introduction about this feature can be found here. https://sourceware.org/glibc/wiki/GNU_IFUNC
> 
> regards
> Ramana
> 
>>
>> Hi ARM guys,
>> are you happy to share yours idea about this?
>>
>>> Is this really that important for performance and on what workloads ?
>>>
>>
>> Since it is not necessary for sync icache and dcache, it is beneficial for
>> performance to skip the redundant DC CVAU and do IC IVAU only.
>> For JVM, __clear_cache is called many times.
>>
>> Thanks,
>> Shaokun
>>
>>> regards
>>> Ramana
>>>
>>>>
>>>> Any thoughts and ideas are welcome.
>>>>
>>>> Shaokun Zhang (1):
>>>>    [aarch64] Add HiSilicon tsv110 CPU support.
>>>>
>>>>   gcc/ChangeLog                            |   9 +++
>>>>   gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>>   gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>>   gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>>   gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>>   gcc/doc/invoke.texi                      |   2 +-
>>>>   6 files changed, 198 insertions(+), 2 deletions(-)
>>>>
>>>> -- 
>>>> 2.7.4
>>>>
>>>
>>>
>>
> 
> .
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-06-01  9:56       ` Zhangshaokun
@ 2018-06-06 14:51         ` Kyrill Tkachov
  2018-06-07  9:07           ` Zhangshaokun
  0 siblings, 1 reply; 12+ messages in thread
From: Kyrill Tkachov @ 2018-06-06 14:51 UTC (permalink / raw)
  To: Zhangshaokun, Ramana Radhakrishnan
  Cc: gcc-patches, Marcus Shawcroft, Richard Earnshaw (lists),
	James Greenhalgh, felix.yang

Hi Shaokun,

On 01/06/18 10:56, Zhangshaokun wrote:
> Hi Ramana,
>
> Sorry to reply so later because of short leave.
>
> On 2018/5/23 18:41, Ramana Radhakrishnan wrote:
>>
>> On 23/05/2018 03:50, Zhangshaokun wrote:
>>> Hi Ramana,
>>>
>>> On 2018/5/22 18:28, Ramana Radhakrishnan wrote:
>>>> On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
>>>> <zhangshaokun@hisilicon.com> wrote:
>>>>> tsv110 is designed by HiSilicon and supports v8_4A, it also optimizes
>>>>> L1 Icache which can access L1 Dcache.
>>>>> Therefore, DC CVAU is not necessary in __aarch64_sync_cache_range for
>>>>> tsv110, is there any good idea to skip DC CVAU operation for tsv110.
>>>> A solution would be to use an ifunc but on a cpu variant.
>>>>
>>> ifunc, can you give further explanation?
>>> If on a cpu variant, for HiSilicon tsv110, we have two version and CPU variants
>>> are 0 and 1. Both are expected to skip DC CVAU operation in sync icache and
>>> dcache.
>>>> Since it is not necessary for sync icache and dcache, it is beneficial for
>>>> performance to skip the redundant DC CVAU and do IC IVAU only.
>>>> For JVM, __clear_cache is called many times.
>>>>
>> Thanks for some more detail as to where you think you want to use this. Have you investigated whether the jvm can actually elide such a call rather than trying to fix this in the toolchain ?
>>
> In fact, We(HiSilicon) want optimize DC CVAU not only in the toolchain, but also for LLVM and others.
>
>> If you really need to think about solutions in the toolchain -
>>
>> The simplest first step would be to implement the changes hinted at by the comment in aarch64.h .
>>
>>   If you read the comment above CLEAR_INSN_CACHE in aarch64.h you would see that
>>
>> /* This definition should be relocated to aarch64-elf-raw.h.  This macro
>>     should be undefined in aarch64-linux.h and a clear_cache pattern
>>     implmented to emit either the call to __aarch64_sync_cache_range()
>>     directly or preferably the appropriate sycall or cache clear
>>     instructions inline.  */
>> #define CLEAR_INSN_CACHE(beg, end)                              \
>>    extern void  __aarch64_sync_cache_range (void *, void *);     \
>>    __aarch64_sync_cache_range (beg, end)
>>
>> Thus I would expect that by implementing the clear_cache pattern and deciding whether to put out the call to the __aarch64_sync_cache_range function or not depending on whether you had the tsv110 chosen on the command line would allow you to have an idea of what the performance gain actually is by compiling the jvm with -mcpu=tsv110 vs -march=armv8-a. You probably also want to clean up the trampoline_init code while you are here.
>>
> Thanks for your nice explanation and guidance.
> For our next generation cpu core tsv200, We will optimize for IC IVAU that there is no need to
> flush Icache, keep the clear_cache as NOP function. Shall I consider this? or Maybe i lose
> something what your said.

I've had a look at the __clear_cache implementation and investigated these cache coherency bits.
If clearing the instruction cache means you don't need to explicitly clear the data cache then
the IDC bit of the CTR_EL0 register will be set to 1. This is how you can identify that you can
avoid the explicit "DC CVAU" in __clear_cache.
Have a look at the D10.2.33 section in the Arm Architecture Reference Manual Issue C.a [1]
for more documentation.

To implement this elision in libgcc you'd need to extend __aarch64_sync_cache_range
in config/aarch64/sync-cache.c to read the IDC bit from CTR_EL0.
The code there already reads CTR_EL0 and caches its value so you just need to extract that bit
and use it to decide whether to perform the "DC CVAU" loop.
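
Roughly like the sketch below, modelled on the current sync-cache.c but
simplified; the CTR_IDC_SHIFT/CTR_DIC_SHIFT names, and where exactly the
barriers end up, are my assumptions rather than a finished patch:

#define CTR_IDC_SHIFT  28
#define CTR_DIC_SHIFT  29

void
__aarch64_sync_cache_range (const void *base, const void *end)
{
  unsigned long cache_info;
  unsigned icache_lsize, dcache_lsize;
  const char *address;

  /* CTR_EL0 [3:0] is log2 of the icache line size in words,
     CTR_EL0 [19:16] is log2 of the dcache line size in words.  */
  asm volatile ("mrs\t%0, ctr_el0" : "=r" (cache_info));

  icache_lsize = 4 << (cache_info & 0xF);
  dcache_lsize = 4 << ((cache_info >> 16) & 0xF);

  /* CTR_EL0.IDC set: a D-cache clean to the point of unification is not
     required for instruction/data coherence, so skip the DC CVAU loop.  */
  if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0)
    {
      for (address = (const char *) ((__UINTPTR_TYPE__) base
                                     & ~(__UINTPTR_TYPE__) (dcache_lsize - 1));
           address < (const char *) end;
           address += dcache_lsize)
        asm volatile ("dc\tcvau, %0" : : "r" (address) : "memory");
      asm volatile ("dsb\tish" : : : "memory");
    }

  /* Likewise, CTR_EL0.DIC set means the IC IVAU loop is not needed.  */
  if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0)
    {
      for (address = (const char *) ((__UINTPTR_TYPE__) base
                                     & ~(__UINTPTR_TYPE__) (icache_lsize - 1));
           address < (const char *) end;
           address += icache_lsize)
        asm volatile ("ic\tivau, %0" : : "r" (address) : "memory");
      asm volatile ("dsb\tish" : : : "memory");
    }

  asm volatile ("isb" : : : "memory");
}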

But that should be a patch on its own.
Your patch to add a tsv110 entry into aarch64-cores.def can be respun and reviewed separately.

Thanks,
Kyrill

[1] https://developer.arm.com/products/architecture/a-profile/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile


> Thanks,
> Shaokun
>
>> I do think that's something that should be easy enough to do and the subject of a patch series in its own right. If your users can rebuild the world for tsv110 then this is sufficient.
>>
>> If you want to have a single jvm binary without any run time checks, then you need to investigate the use of ifuncs which are a mechanism in the GNU toolchain for some of this kind of stuff. We tend not to ifuncs on a per CPU basis unless there is a very good reason and the performance improvement is worth it (but probably more on a per architecture or per architectural basis) and you will need to make the case for it including what sort of performance benefits it gives. Some introduction about this feature can be found here. https://sourceware.org/glibc/wiki/GNU_IFUNC
>>
>> regards
>> Ramana
>>
>>> Hi ARM guys,
>>> are you happy to share yours idea about this?
>>>
>>>> Is this really that important for performance and on what workloads ?
>>>>
>>> Since it is not necessary for sync icache and dcache, it is beneficial for
>>> performance to skip the redundant DC CVAU and do IC IVAU only.
>>> For JVM, __clear_cache is called many times.
>>>
>>> Thanks,
>>> Shaokun
>>>
>>>> regards
>>>> Ramana
>>>>
>>>>> Any thoughts and ideas are welcome.
>>>>>
>>>>> Shaokun Zhang (1):
>>>>>     [aarch64] Add HiSilicon tsv110 CPU support.
>>>>>
>>>>>    gcc/ChangeLog                            |   9 +++
>>>>>    gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>>>    gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>>>    gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>>>    gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>>>    gcc/doc/invoke.texi                      |   2 +-
>>>>>    6 files changed, 198 insertions(+), 2 deletions(-)
>>>>>
>>>>> -- 
>>>>> 2.7.4
>>>>>
>>>>
>> .
>>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [aarch64] Add HiSilicon tsv110 CPU support
  2018-06-06 14:51         ` Kyrill Tkachov
@ 2018-06-07  9:07           ` Zhangshaokun
  0 siblings, 0 replies; 12+ messages in thread
From: Zhangshaokun @ 2018-06-07  9:07 UTC (permalink / raw)
  To: Kyrill Tkachov, Ramana Radhakrishnan
  Cc: gcc-patches, Marcus Shawcroft, Richard Earnshaw (lists),
	James Greenhalgh, felix.yang

Hi Kyrill,

On 2018/6/6 22:51, Kyrill Tkachov wrote:
> Hi Shaokun,
> 
> On 01/06/18 10:56, Zhangshaokun wrote:
>> Hi Ramana,
>>
>> Sorry to reply so later because of short leave.
>>
>> On 2018/5/23 18:41, Ramana Radhakrishnan wrote:
>>>
>>> On 23/05/2018 03:50, Zhangshaokun wrote:
>>>> Hi Ramana,
>>>>
>>>> On 2018/5/22 18:28, Ramana Radhakrishnan wrote:
>>>>> On Tue, May 22, 2018 at 9:40 AM, Shaokun Zhang
>>>>> <zhangshaokun@hisilicon.com> wrote:
>>>>>> tsv110 is designed by HiSilicon and supports v8_4A, it also optimizes
>>>>>> L1 Icache which can access L1 Dcache.
>>>>>> Therefore, DC CVAU is not necessary in __aarch64_sync_cache_range for
>>>>>> tsv110, is there any good idea to skip DC CVAU operation for tsv110.
>>>>> A solution would be to use an ifunc but on a cpu variant.
>>>>>
>>>> ifunc, can you give further explanation?
>>>> If on a cpu variant, for HiSilicon tsv110, we have two version and CPU variants
>>>> are 0 and 1. Both are expected to skip DC CVAU operation in sync icache and
>>>> dcache.
>>>>> Since it is not necessary for sync icache and dcache, it is beneficial for
>>>>> performance to skip the redundant DC CVAU and do IC IVAU only.
>>>>> For JVM, __clear_cache is called many times.
>>>>>
>>> Thanks for some more detail as to where you think you want to use this. Have you investigated whether the jvm can actually elide such a call rather than trying to fix this in the toolchain ?
>>>
>> In fact, We(HiSilicon) want optimize DC CVAU not only in the toolchain, but also for LLVM and others.
>>
>>> If you really need to think about solutions in the toolchain -
>>>
>>> The simplest first step would be to implement the changes hinted at by the comment in aarch64.h .
>>>
>>>   If you read the comment above CLEAR_INSN_CACHE in aarch64.h you would see that
>>>
>>> /* This definition should be relocated to aarch64-elf-raw.h.  This macro
>>>     should be undefined in aarch64-linux.h and a clear_cache pattern
>>>     implmented to emit either the call to __aarch64_sync_cache_range()
>>>     directly or preferably the appropriate sycall or cache clear
>>>     instructions inline.  */
>>> #define CLEAR_INSN_CACHE(beg, end)                              \
>>>    extern void  __aarch64_sync_cache_range (void *, void *);     \
>>>    __aarch64_sync_cache_range (beg, end)
>>>
>>> Thus I would expect that by implementing the clear_cache pattern and deciding whether to put out the call to the __aarch64_sync_cache_range function or not depending on whether you had the tsv110 chosen on the command line would allow you to have an idea of what the performance gain actually is by compiling the jvm with -mcpu=tsv110 vs -march=armv8-a. You probably also want to clean up the trampoline_init code while you are here.
>>>
>> Thanks for your nice explanation and guidance.
>> For our next generation cpu core tsv200, We will optimize for IC IVAU that there is no need to
>> flush Icache, keep the clear_cache as NOP function. Shall I consider this? or Maybe i lose
>> something what your said.
> 
> I've had a look at the __clear_cache implementation and investigated these cache coherency bits.
> If clearing the instruction cache means you don't need to explicitly clear the data cache then
> the IDC bit of the CTR_EL0 register will be set to 1. This is how you can identify that you can

Thanks for your guidance. I checked again: the IDC bit was added to CTR_EL0 in ARMv8.5. It is
a pity that our tsv110 core does not set this bit, even though it supports v8.4 and already
elides the need for DC CVAU. For HiSilicon tsv200, both IDC and DIC will be enabled.

> avoid the explicit "DC CVAU" in __clear_cache.
> Have a look at the D10.2.33 section in the Arm Architecture Reference Manual Issue C.a [1]
> for more documentation.
> 
> To implement this elision in libgcc you'd need to extend __arch64_sync_cache_range
> in config/aarch64/sync_cache.c to read the IDC bit from CTR_EL0.
> The code there already reads CTR_EL0 and caches its value so you just need to extract that bit
> and use it to decide whether to perform the "DC CVAU" loop.
> 
> But that should a patch on its own.

Got it; both IDC and DIC shall be checked in this function. That looks like a nice performance
win if the CPU core supports the two features.
Are you happy for me to add this check as a patch, or would you prefer to do it yourself? Either is fine with me.

> Your patch to add a tsv110 entry into aarch64-cores.def can be respun and reviewed separately.

Sure, I will fix it and send a v2 patch to the mailing list and the maintainers.

Thanks,
Shaokun

> 
> Thanks,
> Kyrill
> 
> [1] https://developer.arm.com/products/architecture/a-profile/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> 
> 
>> Thanks,
>> Shaokun
>>
>>> I do think that's something that should be easy enough to do and the subject of a patch series in its own right. If your users can rebuild the world for tsv110 then this is sufficient.
>>>
>>> If you want to have a single jvm binary without any run time checks, then you need to investigate the use of ifuncs which are a mechanism in the GNU toolchain for some of this kind of stuff. We tend not to ifuncs on a per CPU basis unless there is a very good reason and the performance improvement is worth it (but probably more on a per architecture or per architectural basis) and you will need to make the case for it including what sort of performance benefits it gives. Some introduction about this feature can be found here. https://sourceware.org/glibc/wiki/GNU_IFUNC
>>>
>>> regards
>>> Ramana
>>>
>>>> Hi ARM guys,
>>>> are you happy to share yours idea about this?
>>>>
>>>>> Is this really that important for performance and on what workloads ?
>>>>>
>>>> Since it is not necessary for sync icache and dcache, it is beneficial for
>>>> performance to skip the redundant DC CVAU and do IC IVAU only.
>>>> For JVM, __clear_cache is called many times.
>>>>
>>>> Thanks,
>>>> Shaokun
>>>>
>>>>> regards
>>>>> Ramana
>>>>>
>>>>>> Any thoughts and ideas are welcome.
>>>>>>
>>>>>> Shaokun Zhang (1):
>>>>>>     [aarch64] Add HiSilicon tsv110 CPU support.
>>>>>>
>>>>>>    gcc/ChangeLog                            |   9 +++
>>>>>>    gcc/config/aarch64/aarch64-cores.def     |   5 ++
>>>>>>    gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++++++++++
>>>>>>    gcc/config/aarch64/aarch64-tune.md       |   2 +-
>>>>>>    gcc/config/aarch64/aarch64.c             |  79 ++++++++++++++++++++++++
>>>>>>    gcc/doc/invoke.texi                      |   2 +-
>>>>>>    6 files changed, 198 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> -- 
>>>>>> 2.7.4
>>>>>>
>>>>>
>>> .
>>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-06-07  9:07 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-22  8:41 [RFC] [aarch64] Add HiSilicon tsv110 CPU support Shaokun Zhang
2018-05-22  8:43 ` Shaokun Zhang
2018-05-22 11:23   ` Kyrill Tkachov
2018-05-23  5:34     ` Zhangshaokun
2018-05-23  8:26       ` Kyrill Tkachov
2018-05-23  9:05         ` Zhangshaokun
2018-05-22 10:53 ` Ramana Radhakrishnan
2018-05-23  3:51   ` Zhangshaokun
2018-05-23 10:52     ` Ramana Radhakrishnan
2018-06-01  9:56       ` Zhangshaokun
2018-06-06 14:51         ` Kyrill Tkachov
2018-06-07  9:07           ` Zhangshaokun
