public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table.
  2014-11-19 17:34 [PATCH 0/2, AArch64] APM X-Gene 1 cost-table and pipeline model Philipp Tomsich
@ 2014-11-19 17:33 ` Philipp Tomsich
  2014-11-19 18:00   ` Kyrill Tkachov
  2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
  1 sibling, 1 reply; 14+ messages in thread
From: Philipp Tomsich @ 2014-11-19 17:33 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran, Philipp Tomsich

To keep this change separately buildable from the pipeline model,
this patch directs the APM XGene-1 to use the generic scheduling
model.
---
 gcc/ChangeLog                        |   7 +++
 gcc/config/aarch64/aarch64-cores.def |   1 +
 gcc/config/aarch64/aarch64-tune.md   |   2 +-
 gcc/config/aarch64/aarch64.c         |  62 +++++++++++++++++++++
 gcc/config/arm/aarch-cost-tables.h   | 101 +++++++++++++++++++++++++++++++++++
 5 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2fa58ca..5b389c5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-cores.def (xgene1): Update/add the
+	xgene1 (APM XGene-1) core definition.
+	* gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
+	* config/arm/aarch-cost-tables.h: Add cost tables for APM XGene-1
+
 2014-11-18  Maciej W. Rozycki  <macro@codesourcery.com>
 
 	* config/mips/mips.md (compression): Add `micromips32' setting.
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 312941f..e553e50 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -37,6 +37,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa53)
 AARCH64_CORE("cortex-a57",  cortexa15, cortexa15, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa57)
 AARCH64_CORE("thunderx",    thunderx,  thunderx, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
+AARCH64_CORE("xgene1",      xgene1,    xgene1,    8,  AARCH64_FL_FPSIMD, xgene1)
 
 /* V8 big.LITTLE implementations.  */
 
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index c717ea8..6409082 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa53,cortexa15,thunderx,cortexa57cortexa53"
+	"cortexa53,cortexa15,thunderx,xgene1,cortexa57cortexa53"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4fec21e..9b92527 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -226,6 +226,27 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
+static const struct cpu_addrcost_table xgene1_addrcost_table =
+{
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 1),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 1),
+    },
+  NAMED_PARAM (pre_modify, 1),
+  NAMED_PARAM (post_modify, 0),
+  NAMED_PARAM (register_offset, 0),
+  NAMED_PARAM (register_extend, 1),
+  NAMED_PARAM (imm_offset, 0),
+};
+
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   NAMED_PARAM (GP2GP, 1),
@@ -262,6 +283,17 @@ static const struct cpu_regmove_cost thunderx_regmove_cost =
   NAMED_PARAM (FP2FP, 4)
 };
 
+static const struct cpu_regmove_cost xgene1_regmove_cost =
+{
+  NAMED_PARAM (GP2GP, 1),
+  NAMED_PARAM (GP2FP, 8),
+  NAMED_PARAM (FP2GP, 8),
+  /* We currently do not provide direct support for TFmode Q->Q move.
+     Therefore we need to raise the cost above 2 in order to have
+     reload handle the situation.  */
+  NAMED_PARAM (FP2FP, 4)
+};
+
 /* Generic costs for vector insn classes.  */
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
@@ -302,6 +334,26 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 };
 
+/* APM XGene-1 costs for vector insn classes.  */
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_vector_cost xgene1_vector_cost =
+{
+  NAMED_PARAM (scalar_stmt_cost, 1),
+  NAMED_PARAM (scalar_load_cost, 5),
+  NAMED_PARAM (scalar_store_cost, 1),
+  NAMED_PARAM (vec_stmt_cost, 2),
+  NAMED_PARAM (vec_to_scalar_cost, 4),
+  NAMED_PARAM (scalar_to_vec_cost, 4),
+  NAMED_PARAM (vec_align_load_cost, 10),
+  NAMED_PARAM (vec_unalign_load_cost, 10),
+  NAMED_PARAM (vec_unalign_store_cost, 2),
+  NAMED_PARAM (vec_store_cost, 2),
+  NAMED_PARAM (cond_taken_branch_cost, 2),
+  NAMED_PARAM (cond_not_taken_branch_cost, 1)
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -345,6 +397,16 @@ static const struct tune_params thunderx_tunings =
   NAMED_PARAM (issue_rate, 2)
 };
 
+static const struct tune_params xgene1_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  NAMED_PARAM (memmov_cost, 4),
+  NAMED_PARAM (issue_rate, 4)
+};
+
 /* A processor implementing AArch64.  */
 struct processor
 {
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index adf8708..a6313d6 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -325,4 +325,105 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table xgene1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (1), /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    COSTS_N_INSNS (1), /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    COSTS_N_INSNS (1), /* extend.  */
+    0,                 /* extend_arithm.  */
+    COSTS_N_INSNS (1), /* bfi.  */
+    COSTS_N_INSNS (1), /* bfx.  */
+    0,                 /* clz.  */
+    COSTS_N_INSNS (1), /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (4),       /* simple.  */
+      COSTS_N_INSNS (4),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (4),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (20)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (5),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (5),       /* extend.  */
+      COSTS_N_INSNS (5),       /* add.  */
+      COSTS_N_INSNS (5),       /* extend_add.  */
+      COSTS_N_INSNS (21)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (5),         /* load.  */
+    COSTS_N_INSNS (6),         /* load_sign_extend.  */
+    COSTS_N_INSNS (5),         /* ldrd.  */
+    COSTS_N_INSNS (5),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    1,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (10),        /* loadf.  */
+    COSTS_N_INSNS (10),        /* loadd.  */
+    COSTS_N_INSNS (5),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    1,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (23),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub. */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (5),       /* addsub.  */
+      COSTS_N_INSNS (2),       /* fpconst. */
+      COSTS_N_INSNS (3),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (6),       /* widen.  */
+      COSTS_N_INSNS (6),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (4)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (29),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub.  */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (5),       /* addsub.  */
+      COSTS_N_INSNS (3),       /* fpconst.  */
+      COSTS_N_INSNS (3),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (6),       /* widen.  */
+      COSTS_N_INSNS (6),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (4)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1)  /* alu.  */
+  }
+};
+
 #endif /* GCC_AARCH_COST_TABLES_H */
-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 0/2, AArch64] APM X-Gene 1 cost-table and pipeline model
@ 2014-11-19 17:34 Philipp Tomsich
  2014-11-19 17:33 ` [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table Philipp Tomsich
  2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
  0 siblings, 2 replies; 14+ messages in thread
From: Philipp Tomsich @ 2014-11-19 17:34 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran, Philipp Tomsich

As briefly discussed with Marcus yesterday, I'm attaching two patches 
to enable more accurate instruction selection and scheduling on the 
APM X-Gene 1.

Ok for master?

-Philipp.


Philipp Tomsich (2):
  Core definition for APM XGene-1 and associated cost-table.
  Pipeline model for APM XGene-1.

 gcc/ChangeLog                        |  13 +
 gcc/config/aarch64/aarch64-cores.def |   1 +
 gcc/config/aarch64/aarch64-tune.md   |   2 +-
 gcc/config/aarch64/aarch64.c         |  62 +++
 gcc/config/aarch64/aarch64.md        |   4 +-
 gcc/config/arm/aarch-cost-tables.h   | 101 +++++
 gcc/config/arm/xgene1.md             | 739 +++++++++++++++++++++++++++++++++++
 7 files changed, 920 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/arm/xgene1.md

-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  2014-11-19 17:34 [PATCH 0/2, AArch64] APM X-Gene 1 cost-table and pipeline model Philipp Tomsich
  2014-11-19 17:33 ` [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table Philipp Tomsich
@ 2014-11-19 17:36 ` Philipp Tomsich
  2014-11-19 18:08   ` Kyrill Tkachov
                     ` (2 more replies)
  1 sibling, 3 replies; 14+ messages in thread
From: Philipp Tomsich @ 2014-11-19 17:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran, Philipp Tomsich

---
 gcc/ChangeLog                 |   6 +
 gcc/config/aarch64/aarch64.md |   4 +-
 gcc/config/arm/xgene1.md      | 739 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 748 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/arm/xgene1.md

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5b389c5..9cc3b5a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,11 @@
 2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
 
+	* config/aarch64/aarch64.md: Include xgene1.md.
+	(generic_sched): Set to no for xgene1.
+	* config/arm/xgene1.md: New file.
+
+2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
 	* config/aarch64/aarch64-cores.def (xgene1): Update/add the
 	xgene1 (APM XGene-1) core definition.
 	* gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..5d92051 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -191,7 +191,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
+          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
           (const_string "no")
           (const_string "yes"))))
 
@@ -4211,3 +4211,5 @@
 
 ;; Atomic Operations
 (include "atomics.md")
+
+(include "../arm/xgene1.md")
diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
new file mode 100644
index 0000000..3c08b16
--- /dev/null
+++ b/gcc/config/arm/xgene1.md
@@ -0,0 +1,739 @@
+;; Machine description for AppliedMicro xgene1 core.
+;; Copyright (C) 2012 Free Software Foundation, Inc.
+;; Contributed by Theobroma Systems Design und Consulting GmbH.
+;;                See http://www.theobroma-systems.com for more info.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Pipeline description for the xgene1 micro-architecture
+
+(define_automaton "xgene1")
+
+(define_cpu_unit "decode_out_0" "xgene1")
+(define_cpu_unit "decode_out_1" "xgene1")
+(define_cpu_unit "decode_out_2" "xgene1")
+(define_cpu_unit "decode_out_3" "xgene1")
+
+(define_cpu_unit "divide" "xgene1")
+(define_cpu_unit "fp_divide" "xgene1")
+
+(define_reservation "decode1op"
+        "( decode_out_0 )
+        |( decode_out_1 )
+        |( decode_out_2 )
+        |( decode_out_3 )"
+)
+(define_reservation "decode2op"
+        "( decode_out_0 + decode_out_1 )
+        |( decode_out_0 + decode_out_2 )
+        |( decode_out_0 + decode_out_3 )
+        |( decode_out_1 + decode_out_2 )
+        |( decode_out_1 + decode_out_3 )
+        |( decode_out_2 + decode_out_3 )"
+)
+(define_reservation "decodeIsolated"
+        "( decode_out_0 + decode_out_1 + decode_out_2 + decode_out_3 )"
+)
+
+;; (define_insn_reservation "dummy" 1
+;;   (and (eq_attr "tune" "xgene1")
+;;        (eq_attr "type" "neon_minmax"))
+;;   "decodeIsolated")
+
+;; B: nop.
+;; BR: branch op.
+
+;; RET
+;; CBZ
+;; TBZ
+(define_insn_reservation "branch" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "branch"))
+  "decode1op")
+
+;; NOP
+;; HINT
+(define_insn_reservation "nop" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "no_insn"))
+  "decode1op")
+
+;; See #3565
+;; BLR: arithmetic op & branch op.
+;; BL: arithmetic op.
+(define_insn_reservation "call" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "call"))
+  "decode2op")
+
+;; LDR: FP load op & arithmetic op.
+(define_insn_reservation "f_load" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_loadd,f_loads"))
+  "decode2op,nothing*9")
+
+;; STR: FP store op & arithmetic op.
+(define_insn_reservation "f_store" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_stored,f_stores"))
+  "decode2op,nothing*3")
+
+;; FMOV (immediate): FP move op.
+;; FMOV (register): FP move op.
+(define_insn_reservation "fmov" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fmov,fconsts,fconstd"))
+  "decode1op,nothing")
+
+;; LDP: FP load op & FP load op.
+;; LDP: FP load op & FP load op & arithmetic op.
+(define_insn_reservation "f_mcr" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mcr"))
+  "decodeIsolated,nothing*9")
+
+;; STP: FP store op & FP store op.
+(define_insn_reservation "f_mrc" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mrc"))
+  "decode2op,nothing*3")
+
+;; The (register offset) instructions with a shift
+;; of #0, #2, or #3 (or no shift) are translated
+;; as shown.
+;; For these instructions, any other shift amount
+;; causes the instruction to be prefixed with an
+;; sbfm/ubfm op (1 cycle latency).
+
+;; Load/store register pair (post-indexed):
+;; LDP: load op & load op & arithmetic op.
+;; Load/store register pair (offset):
+;; LDP: load op & load op.
+;; Load/store register pair (pre-indexed):
+;; LDP: load op & load op & arithmetic op.
+;; 5 + 1
+(define_insn_reservation "load_pair" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load2"))
+  "decodeIsolated,nothing*5")
+
+;; Load/store register pair (post-indexed):
+;; STP: store op & store op & arithmetic op.
+;; Load/store register pair (offset):
+;; STP: store op & store op.
+;; Load/store register pair (pre-indexed):
+;; STP: store op & store op & arithmetic op.
+;; 1 + 1
+(define_insn_reservation "store_pair" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store2"))
+  "decodeIsolated,nothing")
+
+;; Load register (literal):
+;; LDR: load op.
+;; Load/store register (immediate post-indexed):
+;; LDRB/LDRH/LDR: load op & arithmetic op.
+;; Load/store register (immediate pre-indexed):
+;; LDRB/LDRH/LDR: load op & arithmetic op.
+;; Load/store register (register offset)
+;; LDRB/LDRH/LDR: load op.
+;; LDRSB/LDRSH/LDRSW: load op + sbfm op (1 cycle latency).
+;; Load/store register (unsigned immediate):
+;; LDRB/LDRH/LDR: load op.
+;; 5 + 1
+;; FIXME This is inaccurate but avoids a crash.
+(define_insn_reservation "load1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load1"))
+  "decode2op,nothing")
+
+;; Load/store register (immediate post-indexed):
+;; STRB/STRH/STR: store op & arithmetic op.
+;; Load/store register (immediate pre-indexed):
+;; STRB/STRH/STR: store op & arithmetic op.
+;; Load/store register (register offset)
+;; STRB/STRH/STR: store op.
+;; Load/store register (unsigned immediate):
+;; STRB/STRH/STR: store op.
+;; 1 + 1
+(define_insn_reservation "store1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store1"))
+  "decode2op,nothing")
+
+;; MOVI
+;; MOV
+;; Move wide: logical op.
+;; MRS NZCV: logical op (register result).
+(define_insn_reservation "move" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mov_reg,mov_imm,mrs"))
+  "decode1op")
+
+;; See #3565
+(define_insn_reservation "alu" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
+                        alu_ext,adc_reg,csel,logic_imm,\
+                        logic_reg,logic_shift_imm,clz,\
+                        rbit,shift_reg,adr,mov_reg,\
+                        mov_imm,extend"))
+  "decode1op")
+
+;; REV/REV16/REV32: SIMD op.
+(define_insn_reservation "simd" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "rev"))
+  "decode1op")
+
+;; See #3565.  Covers the alus_*/logics_* types only; alu_sreg belongs to "alu".
+(define_insn_reservation "alus" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alus_imm,alus_sreg,alus_shift_imm,\
+                        alus_ext,logics_imm,logics_reg,\
+                        logics_shift_imm"))
+  "decode1op")
+
+;; MADD/SMADDL/UMADDL with Ra=XZR/WZR: multiply op.
+;; MADD/SMADDL/UMADDL with other Ra: multiply op + arithmetic op.
+;; MSUB/SMSUBL/UMSUBL: multiply op + arithmetic op.
+;; SMULH/UMULH: multiply op.
+;; 5 + 1
+(define_insn_reservation "mul" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
+  "decode2op,nothing*5")
+
+;; UDIV/SDIV: divide op.
+(define_insn_reservation "div" 66
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "sdiv,udiv"))
+  "decode1op,divide*65")
+
+;; FCMP/FCMPE: FP compare op.  Reservation spans the declared latency (1 + 9).
+(define_insn_reservation "fcmp" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcmpd,fcmps"))
+  "decode1op,nothing*9")
+
+;; FCSEL: FP select op
+(define_insn_reservation "fcsel" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcsel"))
+  "decode1op,nothing*2")
+
+;; See #3565
+(define_insn_reservation "bfm" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "bfm"))
+  "decode1op,nothing")
+
+;; FRINTN/FRINTP/FRINTM/FRINTZ/FRINTA/FRINTX/FRINTI:
+;; FP convert op
+(define_insn_reservation "f_rint" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_rintd,f_rints"))
+  "decode1op,nothing*4")
+
+;; FCVT (single to double or double to single): FP arithmetic op.
+;; FCVT (to or from half precision): FP half cvt op.
+(define_insn_reservation "f_cvt" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvt"))
+  "decode1op,nothing*2")
+
+;; Floating-point<->integer conversions:
+;; FCVTNS/FCVTAS/FCVTPS/FCVTMS: FP convert op + FP store op (data bypass path) + (integer) load op.
+;; FCVTNU/FCVTAU/FCVTPU/FCVTMU: FP convert op + FP store op (data bypass path) + (integer) load op.
+;; FCVTZS/FCVTZU (integer): FP convert op + FP store op (data bypass path) + (integer) load op.
+;; Floating-point<->fixed-point conversions:
+;; FCVTZS/FCVTZU (fixed-point): FP convert op + FP store op (data bypass path) + (integer) load op.
+;; 5 + 1 + 5
+(define_insn_reservation "f_cvtf2i" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvtf2i"))
+  "decodeIsolated,nothing*10")
+
+;; SCVTF/UCVTF (integer): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
+;; SCVTF/UCVTF (fixed-point): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
+;; -1 + 10 + 5
+(define_insn_reservation "f_cvti2f" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvti2f"))
+  "decodeIsolated,nothing*13")
+
+;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.  Reservation spans 1 + 4 cycles.
+(define_insn_reservation "f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
+  "decode1op,nothing*4")
+
+;; FDIV: FP divide op.
+(define_insn_reservation "f_div" 28
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fdivd,fdivs"))
+  "decode1op,fp_divide*27")
+
+;; FABS/FNEG: FP move op.
+(define_insn_reservation "f_arith" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "ffarithd,ffariths"))
+  "decode1op,nothing")
+
+;; FSQRT: FP sqrt op.
+(define_insn_reservation "f_sqrt" 38
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fsqrtd,fsqrts"))
+  "decode1op,fp_divide*37")
+
+;; FMAX/FMIN/FMAXNM/FMINNM: FP select op.
+(define_insn_reservation "f_select" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_minmaxd,f_minmaxs"))
+  "decode1op,nothing*2")
+
+
+;; SIMD (aka neon)
+
+;; DUP (element) (size=x1000): ASIMD logical op.
+;; DUP (element) (size=other): ASIMD shift op.
+(define_insn_reservation "neon_dup" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_dup,neon_dup_q"))
+  "decode1op,nothing*2")
+
+;; LDR: FP load op & arithmetic op.
+;; LD1 (one register, 1D): FP load op.
+;; LD1 (one register, 2D): FP load op*2.
+;; LD1 (one register, 2S/4H/8B): FP complex load op.
+;; LD1 (one register, 4S/8H/16B): FP complex load op*2.
+(define_insn_reservation "neon_load1" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
+  "decode2op,nothing*10")
+
+;; STR: FP store op & arithmetic op.
+;; ST1 (one register, 1D): FP store op
+;; ST1 (one register, 2D): FP store op*2
+;; ST1 (one register, 2S/4H/8B): FP complex store op
+;; ST1 (one register, 4S/8H/16B): FP complex store op*2
+;; 4 + 1
+(define_insn_reservation "neon_store1" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
+  "decode2op,nothing*4")
+
+;; MOVI/MVNI/ORR/BIC/FMOV: ASIMD logical op^Q.
+;; AND/BIC/ORR/ORN/EOR/BSL/BIT/BIF: ASIMD logical op^Q.
+(define_insn_reservation "neon_logic" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_logic,\
+                        neon_logic_q,\
+                        neon_bsl,\
+                        neon_bsl_q,\
+                        neon_move,\
+                        neon_move_q,\
+                       "))
+  "decode1op,nothing")
+;; N.B. ^Q means that it only uses one decode slot.
+
+;; UMOV (imm5=xxx00): FP store op (data bypass path) + (integer) load op.
+;; UMOV (imm5=other): FP store op (data bypass path) + (integer) load op + ubfm op (1 cycle latency).
+;; 1 + 5 + 1
+(define_insn_reservation "neon_umov" 7
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
+  "decodeIsolated,nothing*6")
+
+;; INS (element) (imm5=01000): FP move op.
+;; INS (element) (imm5=other): ASIMD shift op + ASIMD insert op.
+;; INS (general register) (imm5=01000): (integer) store op + FP load op.
+;; INS (general register) (imm5=other): (integer) store op + FP load op + ASIMD insert op.
+;; 1 + 10 + 3
+(define_insn_reservation "neon_ins" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_from_gp,\
+                        neon_from_gp_q,\
+                        neon_ins,\
+                        neon_ins_q,\
+                       "))
+  "decodeIsolated,nothing*13")
+
+;; USHR/URSHR: ASIMD shift op.
+;; USHR/URSHR: ASIMD shift op^Q.
+;; SSHR/SRSHR: ASIMD shift op.
+;; SSHR/SRSHR: ASIMD shift op^Q.
+;; SHL/SQSHL/SQSHLU: ASIMD shift op.
+;; SHL/SQSHL/SQSHLU: ASIMD shift op^Q.
+;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op.
+;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op^Q.
+;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op.
+;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op^Q.
+;; XTN/SQXTN/UQXTN/SQXTUN/SHLL: ASIMD shift op.
+;; SSHLL/USHLL: ASIMD shift op*2.
+(define_insn_reservation "neon_shift" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_imm,\
+                        neon_shift_imm_q,\
+                        neon_shift_reg,\
+                        neon_shift_reg_q,\
+                        neon_shift_imm_long,\
+                        neon_sat_shift_imm,\
+                        neon_sat_shift_imm_q,\
+                        neon_sat_shift_imm_narrow_q,\
+                        neon_sat_shift_reg,\
+                        neon_sat_shift_reg_q,\
+                        neon_shift_imm_narrow_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; ADD/SUB: ASIMD arithmetic op.
+;; ADD/SUB/ADDP: ASIMD arithmetic op^Q.
+;; SMAX/SMIN/SABD/SMAXP/SMINP: ASIMD arithmetic op^Q.
+;; UMAX/UMIN/UABD/UMAXP/UMINP: ASIMD arithmetic op^Q.
+;; USQADD/ABS/NEG: ASIMD arithmetic op.
+;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
+;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
+;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
+
+;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
+;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
+;; SADDLP/SUQADD/SQABS/SQNEG: ASIMD arithmetic op^Q.
+;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
+
+;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op.
+;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op.
+;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op^Q.
+;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op^Q.
+(define_insn_reservation "neon_arith" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_add,\
+                        neon_add_q,\
+                        neon_sub,\
+                        neon_sub_q,\
+                        neon_neg,\
+                        neon_neg_q,\
+                        neon_abs,\
+                        neon_abs_q,\
+                        neon_abd_q,\
+                        neon_arith_acc,\
+                        neon_arith_acc_q,\
+                        neon_reduc_add,\
+                        neon_reduc_add_q,\
+                        neon_add_halve,\
+                        neon_add_halve_q,\
+                        neon_sub_halve,\
+                        neon_sub_halve_q,\
+                        neon_qadd,\
+                        neon_qadd_q,\
+                        neon_compare,\
+                        neon_compare_q,\
+                        neon_compare_zero,\
+                        neon_compare_zero_q,\
+                        neon_tst,\
+                        neon_tst_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; SABA/UABA: (ASIMD arithmetic op + ASIMD arithmetic op)^Q.
+;; 3*3
+(define_insn_reservation "neon_abs_diff" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
+  "decode2op,nothing*5")
+
+;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
+;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
+;; SMLAL/SMLSL/SMULL/SQDMLAL/SQDMLSL/SQDMULL: ASIMD multiply op*2.
+;; SMULL/SMLAL/SMLSL (by element): ASIMD multiply op*2.
+;; UMULL/UMLAL/UMLSL (by element): ASIMD multiply op*2.
+;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op.
+;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op^Q.
+
+;; SQDMULH/SQRDMULH: ASIMD multiply op.
+;; SQDMULH/SQRDMULH (by element): ASIMD multiply op.
+;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
+;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
+
+;; SQDMULL/SQDMLAL/SQDMLSL (by element): ASIMD multiply op*2.
+;; SQDMULL/SQDMLAL/SQDMLSL: ASIMD multiply op.
+(define_insn_reservation "neon_mul" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_b,\
+                        neon_mul_b_q,\
+                        neon_mul_h,\
+                        neon_mul_h_q,\
+                        neon_mul_s,\
+                        neon_mul_s_q,\
+                        neon_fp_mul_s_scalar,\
+                        neon_fp_mul_s_scalar_q,\
+                        neon_fp_mul_d_scalar_q,\
+                        neon_mla_b,neon_mla_b_q,\
+                        neon_mla_h,neon_mla_h_q,\
+                        neon_mla_s,neon_mla_s_q,\
+                        neon_mla_h_scalar,\
+                        neon_mla_h_scalar_q,\
+                        neon_mla_s_scalar,\
+                        neon_mla_s_scalar_q,\
+                        neon_mla_b_long,\
+                        neon_mla_h_long,\
+                        neon_mla_s_long,\
+                        neon_fp_mul_s,\
+                        neon_fp_mul_s_q,\
+                        neon_fp_mul_d,\
+                        neon_fp_mul_d_q,\
+                        neon_fp_mla_s,\
+                        neon_fp_mla_s_q,\
+                        neon_fp_mla_d,\
+                        neon_fp_mla_d_q,\
+                        neon_fp_mla_s_scalar,\
+                        neon_fp_mla_s_scalar_q,\
+                        neon_fp_mla_d_scalar_q,\
+                        neon_sat_mul_b,\
+                        neon_sat_mul_b_q,\
+                        neon_sat_mul_h,\
+                        neon_sat_mul_h_q,\
+                        neon_sat_mul_s,\
+                        neon_sat_mul_s_q,\
+                        neon_sat_mul_h_scalar,\
+                        neon_sat_mul_h_scalar_q,\
+                        neon_sat_mul_s_scalar,\
+                        neon_sat_mul_s_scalar_q,\
+                        neon_sat_mul_h_scalar_long,\
+                        neon_sat_mul_s_scalar_long,\
+                        neon_sat_mla_b_long,\
+                        neon_sat_mla_h_long,\
+                        neon_sat_mla_s_long,\
+                        neon_sat_mla_h_scalar_long,\
+                        neon_sat_mla_s_scalar_long,\
+                       "))
+  "decode2op,nothing*4")
+
+;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
+;; FABD: FP arithmetic op^Q.
+(define_insn_reservation "fp_abd_diff" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_abd_s,\
+                        neon_fp_abd_s_q,\
+                        neon_fp_abd_d,\
+                        neon_fp_abd_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; See #3565
+;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
+(define_insn_reservation "neon_f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_addsub_s,\
+                        neon_fp_addsub_s_q,\
+                        neon_fp_addsub_d,\
+                        neon_fp_addsub_d_q,\
+                       "))
+  "decode1op,nothing*5")
+
+;; FDIV: FP divide op^Q.
+(define_insn_reservation "neon_f_div" 28
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_div_s,\
+                        neon_fp_div_s_q,\
+                        neon_fp_div_d,\
+                        neon_fp_div_d_q,\
+                       "))
+  "decode1op,fp_divide*27")
+
+;; FABS/FNEG: FP move op^Q.
+(define_insn_reservation "neon_f_neg" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_neg_s,\
+                        neon_fp_neg_s_q,\
+                        neon_fp_neg_d,\
+                        neon_fp_neg_d_q,\
+                        neon_fp_abs_s,\
+                        neon_fp_abs_s_q,\
+                        neon_fp_abs_d,\
+                        neon_fp_abs_d_q,\
+                       "))
+  "decode1op,nothing")
+
+;; FRINTN/FRINTM/FRINTA/FRINTP/FRINTZ/FRINTX/FRINTI: FP convert op^Q.
+(define_insn_reservation "neon_f_round" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_round_s,\
+                        neon_fp_round_s_q,\
+                        neon_fp_round_d,\
+                        neon_fp_round_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; FCVTNS/FCVTMS/FCVTAS/FCVTPS: FP convert op^Q.
+;; FCVTNU/FCVTMU/FCVTAU/FCVTPU: FP convert op^Q.
+;; FCVTZS/FCVTZU (integer): FP convert op^Q.
+;; FCVTN/FCVTL (size=0): FP half cvt op.
+(define_insn_reservation "neon_f_cvt" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type"  "neon_int_to_fp_s,\
+                         neon_int_to_fp_s_q,\
+                         neon_int_to_fp_d,\
+                         neon_int_to_fp_d_q,\
+                         neon_fp_cvt_widen_s,\
+                         neon_fp_cvt_narrow_s_q,\
+                         neon_fp_cvt_narrow_d_q,\
+                        "))
+  "decode1op,nothing*4")
+
+;; FADD/FSUB/FMULX/FMLA/FMLS/FADDP: FP arithmetic op^Q.
+(define_insn_reservation "neon_f_reduc" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_reduc_add_s,\
+                        neon_fp_reduc_add_s_q,\
+                        neon_fp_reduc_add_d,\
+                        neon_fp_reduc_add_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; CLS/CLZ/CNT/NOT/RBIT: ASIMD logical op^Q.
+;; PMUL: ASIMD logical op^Q.
+(define_insn_reservation "neon_cls" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_cls,neon_cls_q"))
+  "decode1op,nothing")
+
+;; ST1 (one register, 1D): FP store op.
+(define_insn_reservation "neon_st1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_one_lane,\
+                        neon_store1_one_lane_q,\
+                       "))
+  "decode1op,nothing*3")
+
+;; ADDHN/SUBHN/RADDHN/RSUBHN: ASIMD arithmetic op*2 + ASIMD shift op.
+;; 3 + 3
+(define_insn_reservation "neon_halve_narrow" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_sub_halve_narrow_q,\
+                        neon_add_halve_narrow_q,\
+                       "))
+  "decodeIsolated,nothing*5")
+
+;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op).
+;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op).
+;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
+;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
+;; 3 + 3
+(define_insn_reservation "neon_shift_acc" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_acc,\
+                        neon_shift_acc_q,\
+                       "))
+  "decode2op,nothing*5")
+
+;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op.
+;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op.
+;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op^Q.
+;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op^Q.
+(define_insn_reservation "neon_fp_compare" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_compare_s,\
+                        neon_fp_compare_s_q,\
+                        neon_fp_compare_d,\
+                        neon_fp_compare_d_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; FSQRT: FP sqrt op.
+(define_insn_reservation "neon_fp_sqrt" 38
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_sqrt_s,\
+                        neon_fp_sqrt_s_q,\
+                        neon_fp_sqrt_d,\
+                        neon_fp_sqrt_d_q,\
+                       "))
+  "decode1op,fp_divide*37")
+
+;; See #3566
+;; TBL/TBX (single register table): (ASIMD logical op + ASIMD logical op)^Q.
+;; 2 + 2
+(define_insn_reservation "neon_tbl1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl1,\
+                        neon_tbl1_q,\
+                       "))
+  "decode2op,nothing*2")
+
+;; TBL/TBX (two register table): (ASIMD logical op + ASIMD logical op + ASIMD logical op + ASIMD logical op)^Q.
+;; 2 + 2 + 2 + 2
+(define_insn_reservation "neon_tbl2" 8
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl2,\
+                        neon_tbl2_q,\
+                       "))
+  "decodeIsolated,nothing*7")
+
+;; See #3565
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=0): ASIMD shift op.
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=11): ASIMD logical op*2.
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=other): ASIMD shift op*2.
+;; TRN1/TRN2 (size=11): ASIMD logical op*2.
+;; TRN1/TRN2 (size=other): ASIMD shift op^Q.
+(define_insn_reservation "neon_permute" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_permute,\
+                        neon_permute_q,\
+                       "))
+  "decode2op,nothing*2")
+
+;; LD1R: FP load op.
+(define_insn_reservation "neon_ld1r" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_all_lanes,\
+                       "))
+  "decode1op,nothing*9")
+
+;; FRECPE/FRECPX: ASIMD dre op.
+;; FRECPE/FRECPX: ASIMD dre op^Q.
+(define_insn_reservation "neon_fp_recp" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recpe_s,\
+                        neon_fp_recpe_s_q,\
+                        neon_fp_recpe_d,\
+                        neon_fp_recpe_d_q,\
+                        neon_fp_recpx_s,\
+                        neon_fp_recpx_s_q,\
+                        neon_fp_recpx_d,\
+                        neon_fp_recpx_d_q,\
+                       "))
+  "decode1op,nothing*2")
+
+
+;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
+;; FRECPS/FRSQRTS: FP arithmetic op^Q.
+(define_insn_reservation "neon_fp_recp_s" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recps_s,\
+                        neon_fp_recps_s_q,\
+                        neon_fp_recps_d,\
+                        neon_fp_recps_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; See #3566
+;; PMULL: ASIMD polymul op*2.
+(define_insn_reservation "neon_pmull" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_d_long,\
+                       "))
+  "decode2op,nothing*4")
-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table.
  2014-11-19 17:33 ` [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table Philipp Tomsich
@ 2014-11-19 18:00   ` Kyrill Tkachov
  2014-11-19 18:02     ` [PATCH 1/2, AArch64, v2] " Philipp Tomsich
  0 siblings, 1 reply; 14+ messages in thread
From: Kyrill Tkachov @ 2014-11-19 18:00 UTC (permalink / raw)
  To: Philipp Tomsich, gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran

Hi Philipp,

The new -mcpu option needs documenting in invoke.texi and a note to 
wwwdocs in changes.html would be nice too.

Kyrill

On 19/11/14 17:32, Philipp Tomsich wrote:
> To keep this change separately buildable from the pipeline model,
> this patch directs the APM XGene-1 to use the generic scheduling
> model.
> ---
>   gcc/ChangeLog                        |   7 +++
>   gcc/config/aarch64/aarch64-cores.def |   1 +
>   gcc/config/aarch64/aarch64-tune.md   |   2 +-
>   gcc/config/aarch64/aarch64.c         |  62 +++++++++++++++++++++
>   gcc/config/arm/aarch-cost-tables.h   | 101 +++++++++++++++++++++++++++++++++++
>   5 files changed, 172 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 2fa58ca..5b389c5 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
> +	* config/aarch64/aarch64-cores.def (xgene1): Update/add the
> +	xgene1 (APM XGene-1) core definition.
> +	* gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
> +	* config/arm/aarch-cost-tables.h: Add cost tables for APM XGene-1
> +
>   2014-11-18  Maciej W. Rozycki  <macro@codesourcery.com>
>   
>   	* config/mips/mips.md (compression): Add `micromips32' setting.
> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
> index 312941f..e553e50 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -37,6 +37,7 @@
>   AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa53)
>   AARCH64_CORE("cortex-a57",  cortexa15, cortexa15, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa57)
>   AARCH64_CORE("thunderx",    thunderx,  thunderx, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
> +AARCH64_CORE("xgene1",      xgene1,    xgene1,    8,  AARCH64_FL_FPSIMD, xgene1)
>   
>   /* V8 big.LITTLE implementations.  */
>   
> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
> index c717ea8..6409082 100644
> --- a/gcc/config/aarch64/aarch64-tune.md
> +++ b/gcc/config/aarch64/aarch64-tune.md
> @@ -1,5 +1,5 @@
>   ;; -*- buffer-read-only: t -*-
>   ;; Generated automatically by gentune.sh from aarch64-cores.def
>   (define_attr "tune"
> -	"cortexa53,cortexa15,thunderx,cortexa57cortexa53"
> +	"cortexa53,cortexa15,thunderx,xgene1,cortexa57cortexa53"
>   	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 4fec21e..9b92527 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -226,6 +226,27 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
>   #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
>   __extension__
>   #endif
> +static const struct cpu_addrcost_table xgene1_addrcost_table =
> +{
> +#if HAVE_DESIGNATED_INITIALIZERS
> +  .addr_scale_costs =
> +#endif
> +    {
> +      NAMED_PARAM (hi, 1),
> +      NAMED_PARAM (si, 0),
> +      NAMED_PARAM (di, 0),
> +      NAMED_PARAM (ti, 1),
> +    },
> +  NAMED_PARAM (pre_modify, 1),
> +  NAMED_PARAM (post_modify, 0),
> +  NAMED_PARAM (register_offset, 0),
> +  NAMED_PARAM (register_extend, 1),
> +  NAMED_PARAM (imm_offset, 0),
> +};
> +
> +#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
> +__extension__
> +#endif
>   static const struct cpu_regmove_cost generic_regmove_cost =
>   {
>     NAMED_PARAM (GP2GP, 1),
> @@ -262,6 +283,17 @@ static const struct cpu_regmove_cost thunderx_regmove_cost =
>     NAMED_PARAM (FP2FP, 4)
>   };
>   
> +static const struct cpu_regmove_cost xgene1_regmove_cost =
> +{
> +  NAMED_PARAM (GP2GP, 1),
> +  NAMED_PARAM (GP2FP, 8),
> +  NAMED_PARAM (FP2GP, 8),
> +  /* We currently do not provide direct support for TFmode Q->Q move.
> +     Therefore we need to raise the cost above 2 in order to have
> +     reload handle the situation.  */
> +  NAMED_PARAM (FP2FP, 4)
> +};
> +
>   /* Generic costs for vector insn classes.  */
>   #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
>   __extension__
> @@ -302,6 +334,26 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
>     NAMED_PARAM (cond_not_taken_branch_cost, 1)
>   };
>   
> +/* Generic costs for vector insn classes.  */
> +#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
> +__extension__
> +#endif
> +static const struct cpu_vector_cost xgene1_vector_cost =
> +{
> +  NAMED_PARAM (scalar_stmt_cost, 1),
> +  NAMED_PARAM (scalar_load_cost, 5),
> +  NAMED_PARAM (scalar_store_cost, 1),
> +  NAMED_PARAM (vec_stmt_cost, 2),
> +  NAMED_PARAM (vec_to_scalar_cost, 4),
> +  NAMED_PARAM (scalar_to_vec_cost, 4),
> +  NAMED_PARAM (vec_align_load_cost, 10),
> +  NAMED_PARAM (vec_unalign_load_cost, 10),
> +  NAMED_PARAM (vec_unalign_store_cost, 2),
> +  NAMED_PARAM (vec_store_cost, 2),
> +  NAMED_PARAM (cond_taken_branch_cost, 2),
> +  NAMED_PARAM (cond_not_taken_branch_cost, 1)
> +};
> +
>   #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
>   __extension__
>   #endif
> @@ -345,6 +397,16 @@ static const struct tune_params thunderx_tunings =
>     NAMED_PARAM (issue_rate, 2)
>   };
>   
> +static const struct tune_params xgene1_tunings =
> +{
> +  &xgene1_extra_costs,
> +  &xgene1_addrcost_table,
> +  &xgene1_regmove_cost,
> +  &xgene1_vector_cost,
> +  NAMED_PARAM (memmov_cost, 4),
> +  NAMED_PARAM (issue_rate, 4)
> +};
> +
>   /* A processor implementing AArch64.  */
>   struct processor
>   {
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index adf8708..a6313d6 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -325,4 +325,105 @@ const struct cpu_cost_table cortexa57_extra_costs =
>     }
>   };
>   
> +const struct cpu_cost_table xgene1_extra_costs =
> +{
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    COSTS_N_INSNS (1), /* shift_reg.  */
> +    COSTS_N_INSNS (1), /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    COSTS_N_INSNS (1), /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    COSTS_N_INSNS (1), /* extend.  */
> +    0,                 /* extend_arithm.  */
> +    COSTS_N_INSNS (1), /* bfi.  */
> +    COSTS_N_INSNS (1), /* bfx.  */
> +    0,                 /* clz.  */
> +    COSTS_N_INSNS (1), /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (4),       /* simple.  */
> +      COSTS_N_INSNS (4),       /* flag_setting.  */
> +      COSTS_N_INSNS (4),       /* extend.  */
> +      COSTS_N_INSNS (4),       /* add.  */
> +      COSTS_N_INSNS (4),       /* extend_add.  */
> +      COSTS_N_INSNS (20)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (5),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (5),       /* extend.  */
> +      COSTS_N_INSNS (5),       /* add.  */
> +      COSTS_N_INSNS (5),       /* extend_add.  */
> +      COSTS_N_INSNS (21)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (5),         /* load.  */
> +    COSTS_N_INSNS (6),         /* load_sign_extend.  */
> +    COSTS_N_INSNS (5),         /* ldrd.  */
> +    COSTS_N_INSNS (5),         /* ldm_1st.  */
> +    1,                         /* ldm_regs_per_insn_1st.  */
> +    1,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (10),        /* loadf.  */
> +    COSTS_N_INSNS (10),        /* loadd.  */
> +    COSTS_N_INSNS (5),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    1,                         /* stm_regs_per_insn_1st.  */
> +    1,                         /* stm_regs_per_insn_subsequent.  */
> +    0,                         /* storef.  */
> +    0,                         /* stored.  */
> +    0,                         /* store_unaligned.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (23),      /* div.  */
> +      COSTS_N_INSNS (5),       /* mult.  */
> +      COSTS_N_INSNS (5),       /* mult_addsub. */
> +      COSTS_N_INSNS (5),       /* fma.  */
> +      COSTS_N_INSNS (5),       /* addsub.  */
> +      COSTS_N_INSNS (2),       /* fpconst. */
> +      COSTS_N_INSNS (3),       /* neg.  */
> +      COSTS_N_INSNS (2),       /* compare.  */
> +      COSTS_N_INSNS (6),       /* widen.  */
> +      COSTS_N_INSNS (6),       /* narrow.  */
> +      COSTS_N_INSNS (4),       /* toint.  */
> +      COSTS_N_INSNS (4),       /* fromint.  */
> +      COSTS_N_INSNS (4)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (29),      /* div.  */
> +      COSTS_N_INSNS (5),       /* mult.  */
> +      COSTS_N_INSNS (5),       /* mult_addsub.  */
> +      COSTS_N_INSNS (5),       /* fma.  */
> +      COSTS_N_INSNS (5),       /* addsub.  */
> +      COSTS_N_INSNS (3),       /* fpconst.  */
> +      COSTS_N_INSNS (3),       /* neg.  */
> +      COSTS_N_INSNS (2),       /* compare.  */
> +      COSTS_N_INSNS (6),       /* widen.  */
> +      COSTS_N_INSNS (6),       /* narrow.  */
> +      COSTS_N_INSNS (4),       /* toint.  */
> +      COSTS_N_INSNS (4),       /* fromint.  */
> +      COSTS_N_INSNS (4)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (1)  /* alu.  */
> +  }
> +};
> +
>   #endif /* GCC_AARCH_COST_TABLES_H */


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/2, AArch64, v2] Core definition for APM XGene-1 and associated cost-table.
  2014-11-19 18:00   ` Kyrill Tkachov
@ 2014-11-19 18:02     ` Philipp Tomsich
  0 siblings, 0 replies; 14+ messages in thread
From: Philipp Tomsich @ 2014-11-19 18:02 UTC (permalink / raw)
  To: gcc-patches, kyrylo.tkachov
  Cc: marcus.shawcroft, benedikt.huber, ksankaran, Philipp Tomsich

To keep this change separately buildable from the pipeline model,
this patch directs the APM XGene-1 to use the generic scheduling
model.

v2: Revised to document -mcpu=xgene1 in invoke.texi

---
 gcc/ChangeLog                        |   8 +++
 gcc/config/aarch64/aarch64-cores.def |   1 +
 gcc/config/aarch64/aarch64-tune.md   |   2 +-
 gcc/config/aarch64/aarch64.c         |  62 +++++++++++++++++++++
 gcc/config/arm/aarch-cost-tables.h   | 101 +++++++++++++++++++++++++++++++++++
 gcc/doc/invoke.texi                  |   3 +-
 6 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2fa58ca..c9ac0d9 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-cores.def (xgene1): Update/add the
+	xgene1 (APM XGene-1) core definition.
+	* config/aarch64/aarch64.c: Add cost tables for APM XGene-1.
+	* config/arm/aarch-cost-tables.h: Add cost tables for APM XGene-1.
+	* doc/invoke.texi: Document -mcpu=xgene1.
+
 2014-11-18  Maciej W. Rozycki  <macro@codesourcery.com>
 
 	* config/mips/mips.md (compression): Add `micromips32' setting.
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 312941f..e553e50 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -37,6 +37,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa53)
 AARCH64_CORE("cortex-a57",  cortexa15, cortexa15, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC, cortexa57)
 AARCH64_CORE("thunderx",    thunderx,  thunderx, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
+AARCH64_CORE("xgene1",      xgene1,    xgene1,    8,  AARCH64_FL_FPSIMD, xgene1)
 
 /* V8 big.LITTLE implementations.  */
 
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index c717ea8..6409082 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa53,cortexa15,thunderx,cortexa57cortexa53"
+	"cortexa53,cortexa15,thunderx,xgene1,cortexa57cortexa53"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4fec21e..9b92527 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -226,6 +226,27 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
+static const struct cpu_addrcost_table xgene1_addrcost_table =
+{
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 1),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 1),
+    },
+  NAMED_PARAM (pre_modify, 1),
+  NAMED_PARAM (post_modify, 0),
+  NAMED_PARAM (register_offset, 0),
+  NAMED_PARAM (register_extend, 1),
+  NAMED_PARAM (imm_offset, 0),
+};
+
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   NAMED_PARAM (GP2GP, 1),
@@ -262,6 +283,17 @@ static const struct cpu_regmove_cost thunderx_regmove_cost =
   NAMED_PARAM (FP2FP, 4)
 };
 
+static const struct cpu_regmove_cost xgene1_regmove_cost =
+{
+  NAMED_PARAM (GP2GP, 1),
+  NAMED_PARAM (GP2FP, 8),
+  NAMED_PARAM (FP2GP, 8),
+  /* We currently do not provide direct support for TFmode Q->Q move.
+     Therefore we need to raise the cost above 2 in order to have
+     reload handle the situation.  */
+  NAMED_PARAM (FP2FP, 4)
+};
+
 /* Generic costs for vector insn classes.  */
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
@@ -302,6 +334,26 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 };
 
+/* APM XGene-1 costs for vector insn classes.  */
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_vector_cost xgene1_vector_cost =
+{
+  NAMED_PARAM (scalar_stmt_cost, 1),
+  NAMED_PARAM (scalar_load_cost, 5),
+  NAMED_PARAM (scalar_store_cost, 1),
+  NAMED_PARAM (vec_stmt_cost, 2),
+  NAMED_PARAM (vec_to_scalar_cost, 4),
+  NAMED_PARAM (scalar_to_vec_cost, 4),
+  NAMED_PARAM (vec_align_load_cost, 10),
+  NAMED_PARAM (vec_unalign_load_cost, 10),
+  NAMED_PARAM (vec_unalign_store_cost, 2),
+  NAMED_PARAM (vec_store_cost, 2),
+  NAMED_PARAM (cond_taken_branch_cost, 2),
+  NAMED_PARAM (cond_not_taken_branch_cost, 1)
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -345,6 +397,16 @@ static const struct tune_params thunderx_tunings =
   NAMED_PARAM (issue_rate, 2)
 };
 
+static const struct tune_params xgene1_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  NAMED_PARAM (memmov_cost, 4),
+  NAMED_PARAM (issue_rate, 4)
+};
+
 /* A processor implementing AArch64.  */
 struct processor
 {
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index adf8708..a6313d6 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -325,4 +325,105 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table xgene1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (1), /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    COSTS_N_INSNS (1), /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    COSTS_N_INSNS (1), /* extend.  */
+    0,                 /* extend_arithm.  */
+    COSTS_N_INSNS (1), /* bfi.  */
+    COSTS_N_INSNS (1), /* bfx.  */
+    0,                 /* clz.  */
+    COSTS_N_INSNS (1), /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (4),       /* simple.  */
+      COSTS_N_INSNS (4),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (4),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (20)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (5),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (5),       /* extend.  */
+      COSTS_N_INSNS (5),       /* add.  */
+      COSTS_N_INSNS (5),       /* extend_add.  */
+      COSTS_N_INSNS (21)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (5),         /* load.  */
+    COSTS_N_INSNS (6),         /* load_sign_extend.  */
+    COSTS_N_INSNS (5),         /* ldrd.  */
+    COSTS_N_INSNS (5),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    1,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (10),        /* loadf.  */
+    COSTS_N_INSNS (10),        /* loadd.  */
+    COSTS_N_INSNS (5),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    1,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (23),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub. */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (5),       /* addsub.  */
+      COSTS_N_INSNS (2),       /* fpconst. */
+      COSTS_N_INSNS (3),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (6),       /* widen.  */
+      COSTS_N_INSNS (6),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (4)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (29),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub.  */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (5),       /* addsub.  */
+      COSTS_N_INSNS (3),       /* fpconst.  */
+      COSTS_N_INSNS (3),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (6),       /* widen.  */
+      COSTS_N_INSNS (6),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (4)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1)  /* alu.  */
+  }
+};
+
 #endif /* GCC_AARCH_COST_TABLES_H */
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..10f4716bb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -11935,7 +11935,8 @@ architecture.
 @opindex mtune
 Specify the name of the target processor for which GCC should tune the
 performance of the code.  Permissible values for this option are:
-@samp{generic}, @samp{cortex-a53}, @samp{cortex-a57}, @samp{thunderx}.
+@samp{generic}, @samp{cortex-a53}, @samp{cortex-a57}, @samp{thunderx},
+@samp{xgene1}.
 
 Additionally, this option can specify that GCC should tune the performance
 of the code for a big.LITTLE system.  The only permissible value is
-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
@ 2014-11-19 18:08   ` Kyrill Tkachov
  2014-11-19 18:11   ` Andrew Pinski
  2014-11-19 18:11   ` Kyrill Tkachov
  2 siblings, 0 replies; 14+ messages in thread
From: Kyrill Tkachov @ 2014-11-19 18:08 UTC (permalink / raw)
  To: Philipp Tomsich, gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran

Hi Philipp,

Some comments inline

On 19/11/14 17:32, Philipp Tomsich wrote:
> ---
>   gcc/ChangeLog                 |   6 +
>   gcc/config/aarch64/aarch64.md |   4 +-
>   gcc/config/arm/xgene1.md      | 739 ++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 748 insertions(+), 1 deletion(-)
>   create mode 100644 gcc/config/arm/xgene1.md
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 5b389c5..9cc3b5a 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,11 @@
>   2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>
> +       * config/aarch64/aarch64.md: Include xgene1.md.
> +       (generic_sched): Set to no for xgene1.
> +       * config/arm/xgene1.md: New file.
> +
> +2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
>          * config/aarch64/aarch64-cores.def (xgene1): Update/add the
>          xgene1 (APM XGene-1) core definition.
>          * gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 597ff8c..5d92051 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -191,7 +191,7 @@
>
>   (define_attr "generic_sched" "yes,no"
>     (const (if_then_else
> -          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
> +          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
>             (const_string "no")
>             (const_string "yes"))))
>
> @@ -4211,3 +4211,5 @@
>
>   ;; Atomic Operations
>   (include "atomics.md")
> +
> +(include "../arm/xgene1.md")
> diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
> new file mode 100644
> index 0000000..3c08b16
> --- /dev/null
> +++ b/gcc/config/arm/xgene1.md
> @@ -0,0 +1,739 @@
> +;; Machine description for AppliedMicro xgene1 core.
> +;; Copyright (C) 2012 Free Software Foundation, Inc.

Copyright 2012-2014 I think...

> +;; Contributed by Theobroma Systems Design und Consulting GmbH.
> +;;                See http://www.theobroma-systems.com for more info.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; Pipeline description for the xgene1 micro-architecture
> +
> +(define_automaton "xgene1")
> +
> +(define_cpu_unit "decode_out_0" "xgene1")
> +(define_cpu_unit "decode_out_1" "xgene1")
> +(define_cpu_unit "decode_out_2" "xgene1")
> +(define_cpu_unit "decode_out_3" "xgene1")
> +
> +(define_cpu_unit "divide" "xgene1")
> +(define_cpu_unit "fp_divide" "xgene1")
> +
> +(define_reservation "decode1op"
> +        "( decode_out_0 )
> +        |( decode_out_1 )
> +        |( decode_out_2 )
> +        |( decode_out_3 )"
> +)
> +(define_reservation "decode2op"
> +        "( decode_out_0 + decode_out_1 )
> +        |( decode_out_0 + decode_out_2 )
> +        |( decode_out_0 + decode_out_3 )
> +        |( decode_out_1 + decode_out_2 )
> +        |( decode_out_1 + decode_out_3 )
> +        |( decode_out_2 + decode_out_3 )"
> +)
> +(define_reservation "decodeIsolated"
> +        "( decode_out_0 + decode_out_1 + decode_out_2 + decode_out_3 )"
> +)
> +
> +;; (define_insn_reservation "dummy" 1
> +;;   (and (eq_attr "tune" "xgene1")
> +;;        (eq_attr "type" "neon_minmax"))
> +;;   "decodeIsolated")

Please remove this commented-out reservation.

> +
> +;; B: nop.
> +;; BR: branch op.
> +
> +;; RET
> +;; CBZ
> +;; TBZ
> +(define_insn_reservation "branch" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "branch"))
> +  "decode1op")
> +
> +;; NOP
> +;; HINT
> +(define_insn_reservation "nop" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "no_insn"))
> +  "decode1op")
> +
> +;; See #3565
This reference is not meaningful here; it should be removed. The same
applies elsewhere in the file.

> +;; BLR: arithmetic op & branch op.
> +;; BL: arithmetic op.
> +(define_insn_reservation "call" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "call"))
> +  "decode2op")
> +
> +;; LDR: FP load op & arithmetic op.
> +(define_insn_reservation "f_load" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_loadd,f_loads"))
> +  "decode2op,nothing*9")

You can just write "decode2op". The trailing "nothing*9" is implicit if you
don't specify it.
Using 'nothing' is only useful if you want to model the reservation of a
unit with intermediate 'empty stages', like:
"unit0, nothing*2, unit1". The same applies throughout the file.

> +
> +;; STR: FP store op & arithmetic op.
> +(define_insn_reservation "f_store" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_stored,f_stores"))
> +  "decode2op,nothing*3")
> +
> +;; FMOV (immediate): FP move op.
> +;; FMOV (register): FP move op.
> +(define_insn_reservation "fmov" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fmov,fconsts,fconstd"))
> +  "decode1op,nothing")
> +
> +;; LDP: FP load op & FP load op.
> +;; LDP: FP load op & FP load op & arithmetic op.
> +(define_insn_reservation "f_mcr" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mcr"))
> +  "decodeIsolated,nothing*9")
> +
> +;; STP: FP store op & FP store op.
> +(define_insn_reservation "f_mrc" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mrc"))
> +  "decode2op,nothing*3")
> +
> +;; The (register offset) instructions with a shift
> +;; of #0, #2, or #3 (or no shift) are translated
> +;; as shown.
> +;; For these instructions, any other shift amount
> +;; causes the instruction to be prefixed with an
> +;; sbfm/ubfm op (1 cycle latency).
> +
> +;; Load/store register pair (post-indexed):
> +;; LDP: load op & load op & arithmetic op.
> +;; Load/store register pair (offset):
> +;; LDP: load op & load op.
> +;; Load/store register pair (pre-indexed):
> +;; LDP: load op & load op & arithmetic op.
> +;; 5 + 1
> +(define_insn_reservation "load_pair" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load2"))
> +  "decodeIsolated,nothing*5")
> +
> +;; Load/store register pair (post-indexed):
> +;; STP: store op & store op & arithmetic op.
> +;; Load/store register pair (offset):
> +;; STP: store op & store op.
> +;; Load/store register pair (pre-indexed):
> +;; STP: store op & store op & arithmetic op.
> +;; 1 + 1
> +(define_insn_reservation "store_pair" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store2"))
> +  "decodeIsolated,nothing")
> +
> +;; Load register (literal):
> +;; LDR: load op.
> +;; Load/store register (immediate post-indexed):
> +;; LDRB/LDRH/LDR: load op & arithmetic op.
> +;; Load/store register (immediate pre-indexed):
> +;; LDRB/LDRH/LDR: load op & arithmetic op.
> +;; Load/store register (register offset)
> +;; LDRB/LDRH/LDR: load op.
> +;; LDRSB/LDRSH/LDRSW: load op + sbfm op (1 cycle latency).
> +;; Load/store register (unsigned immediate):
> +;; LDRB/LDRH/LDR: load op.
> +;; 5 + 1
> +;; FIXME This is inaccurate but avoids a crash.
> +(define_insn_reservation "load1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load1"))
> +  "decode2op,nothing")
> +
> +;; Load/store register (immediate post-indexed):
> +;; STRB/STRH/STR: store op & arithmetic op.
> +;; Load/store register (immediate pre-indexed):
> +;; STRB/STRH/STR: store op & arithmetic op.
> +;; Load/store register (register offset)
> +;; STRB/STRH/STR: store op.
> +;; Load/store register (unsigned immediate):
> +;; STRB/STRH/STR: store op.
> +;; 1 + 1
> +(define_insn_reservation "store1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store1"))
> +  "decode2op,nothing")
> +
> +;; MOVI
> +;; MOV
> +;; Move wide: logical op.
> +;; MRS NZCV: logical op (register result).
> +(define_insn_reservation "move" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mov_reg,mov_imm,mrs"))
> +  "decode1op")
> +
> +;; See #3565
> +(define_insn_reservation "alu" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
> +                        alu_ext,adc_reg,csel,logic_imm,\
> +                        logic_reg,logic_shift_imm,clz,\
> +                        rbit,shift_reg,adr,mov_reg,\
> +                        mov_imm,extend"))
> +  "decode1op")
> +
> +;; REV/REV16/REV32: SIMD op.
> +(define_insn_reservation "simd" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "rev"))
> +  "decode1op")
> +
> +;; See #3565
> +(define_insn_reservation "alus" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
> +                        alus_ext,logics_imm,logics_reg,\
> +                        logics_shift_imm"))
> +  "decode1op")
> +
> +;; MADD/SMADDL/UMADDL with Ra=XZR/WZR: multiply op.
> +;; MADD/SMADDL/UMADDL with other Ra: multiply op + arithmetic op.
> +;; MSUB/SMSUBL/UMSUBL: multiply op + arithmetic op.
> +;; SMULH/UMULH: multiply op.
> +;; 5 + 1
> +(define_insn_reservation "mul" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
> +  "decode2op,nothing*5")
> +
> +;; UDIV/SDIV: divide op.
> +(define_insn_reservation "div" 66
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "sdiv,udiv"))
> +  "decode1op,divide*65")

Such large reservations tend to blow up the state-space of the automaton 
without contributing too much to the codegen quality.
See PR 60743 for an example where it bit us.
Is the automaton size reasonable here?
You can get a feel for how large the automaton becomes by adding the options:
  (automata_option "v")
  (automata_option "time")
  (automata_option "stats")
  (automata_option "progress")

to the .md file and it will show you some stats during genautomata.

> +
> +;; FCMP/FCMPE: FP compare op.
> +(define_insn_reservation "fcmp" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcmpd,fcmps"))
> +  "decode1op,nothing*11")
> +
> +;; FCSEL: FP select op
> +(define_insn_reservation "fcsel" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcsel"))
> +  "decode1op,nothing*2")
> +
> +;; See #3565
> +(define_insn_reservation "bfm" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "bfm"))
> +  "decode1op,nothing")
> +
> +;; FRINTN/FRINTP/FRINTM/FRINTZ/FRINTA/FRINTX/FRINTI:
> +;; FP convert op
> +(define_insn_reservation "f_rint" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_rintd,f_rints"))
> +  "decode1op,nothing*4")
> +
> +;; FCVT (single to double or double to single): FP arithmetic op.
> +;; FCVT (to or from half precision): FP half cvt op.
> +(define_insn_reservation "f_cvt" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvt"))
> +  "decode1op,nothing*2")
> +
> +;; Floating-point<->integer conversions:
> +;; FCVTNS/FCVTAS/FCVTPS/FCVTMS: FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; FCVTNU/FCVTAU/FCVTPU/FCVTMU: FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; FCVTZS/FCVTZU (integer): FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; Floating-point<->fixed-point conversions:
> +;; FCVTZS/FCVTZU (fixed-point): FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; 5 + 1 + 5
> +(define_insn_reservation "f_cvtf2i" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvtf2i"))
> +  "decodeIsolated,nothing*10")
> +
> +;; SCVTF/UCVTF (integer): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
> +;; SCVTF/UCVTF (fixed-point): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
> +;; -1 + 10 + 5
> +(define_insn_reservation "f_cvti2f" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvti2f"))
> +  "decodeIsolated,nothing*13")
> +
> +;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
> +(define_insn_reservation "f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
> +  "decode1op,nothing*5")
> +
> +;; FDIV: FP divide op.
> +(define_insn_reservation "f_div" 28
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fdivd,fdivs"))
> +  "decode1op,fp_divide*27")
> +
> +;; FABS/FNEG: FP move op.
> +(define_insn_reservation "f_arith" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "ffarithd,ffariths"))
> +  "decode1op,nothing")
> +
> +;; FSQRT: FP sqrt op.
> +(define_insn_reservation "f_sqrt" 38
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fsqrtd,fsqrts"))
> +  "decode1op,fp_divide*37")
> +
> +;; FMAX/FMIN/FMAXNM/FMINNM: FP select op.
> +(define_insn_reservation "f_select" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_minmaxd,f_minmaxs"))
> +  "decode1op,nothing*2")
> +
> +
> +;; SIMD (aka neon)
> +
> +;; DUP (element) (size=x1000): ASIMD logical op.
> +;; DUP (element) (size=other): ASIMD shift op.
> +(define_insn_reservation "neon_dup" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_dup,neon_dup_q"))
> +  "decode1op,nothing*2")
> +
> +;; LDR: FP load op & arithmetic op.
> +;; LD1 (one register, 1D): FP load op.
> +;; LD1 (one register, 2D): FP load op*2.
> +;; LD1 (one register, 2S/4H/8B): FP complex load op.
> +;; LD1 (one register, 4S/8H/16B): FP complex load op*2.
> +(define_insn_reservation "neon_load1" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
> +  "decode2op,nothing*10")
> +
> +;; STR: FP store op & arithmetic op.
> +;; ST1 (one register, 1D): FP store op
> +;; ST1 (one register, 2D): FP store op*2
> +;; ST1 (one register, 2S/4H/8B): FP complex store op
> +;; ST1 (one register, 4S/8H/16B): FP complex store op*2
> +;; 4 + 1
> +(define_insn_reservation "neon_store1" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
> +  "decode2op,nothing*4")
> +
> +;; MOVI/MVNI/ORR/BIC/FMOV: ASIMD logical op^Q.
> +;; AND/BIC/ORR/ORN/EOR/BSL/BIT/BIF: ASIMD logical op^Q.
> +(define_insn_reservation "neon_logic" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_logic,\
> +                        neon_logic_q,\
> +                        neon_bsl,\
> +                        neon_bsl_q,\
> +                        neon_move,\
> +                        neon_move_q,\
> +                       "))
> +  "decode1op,nothing")
> +;; N.B. ^Q means that it only uses one decode slot.
> +
> +;; UMOV (imm5=xxx00): FP store op (data bypass path) + (integer) load op.
> +;; UMOV (imm5=other): FP store op (data bypass path) + (integer) load op + ubfm op (1 cycle latency).
> +;; 1 + 5 + 1
> +(define_insn_reservation "neon_umov" 7
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
> +  "decodeIsolated,nothing*6")
> +
> +;; INS (element) (imm5=01000): FP move op.
> +;; INS (element) (imm5=other): ASIMD shift op + ASIMD insert op.
> +;; INS (general register) (imm5=01000): (integer) store op + FP load op.
> +;; INS (general register) (imm5=other): (integer) store op + FP load op + ASIMD insert op.
> +;; 1 + 10 + 3
> +(define_insn_reservation "neon_ins" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_from_gp,\
> +                        neon_from_gp_q,\
> +                        neon_ins,\
> +                        neon_ins_q,\
> +                       "))
> +  "decodeIsolated,nothing*13")
> +
> +;; USHR/URSHR: ASIMD shift op.
> +;; USHR/URSHR: ASIMD shift op^Q.
What are these ^Q markers? An editor artifact?

> +;; SSHR/SRSHR: ASIMD shift op.
> +;; SSHR/SRSHR: ASIMD shift op^Q.
> +;; SHL/SQSHL/SQSHLU: ASIMD shift op.
> +;; SHL/SQSHL/SQSHLU: ASIMD shift op^Q.
> +;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op.
> +;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op^Q.
> +;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op.
> +;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op^Q.
> +;; XTN/SQXTN/UQXTN/SQXTUN/SHLL: ASIMD shift op.
> +;; SSHLL/USHLL: ASIMD shift op*2.
> +(define_insn_reservation "neon_shift" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_imm,\
> +                        neon_shift_imm_q,\
> +                        neon_shift_reg,\
> +                        neon_shift_reg_q,\
> +                        neon_shift_imm_long,\
> +                        neon_sat_shift_imm,\
> +                        neon_sat_shift_imm_q,\
> +                        neon_sat_shift_imm_narrow_q,\
> +                        neon_sat_shift_reg,\
> +                        neon_sat_shift_reg_q,\
> +                        neon_shift_imm_narrow_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; ADD/SUB: ASIMD arithmetic op.
> +;; ADD/SUB/ADDP: ASIMD arithmetic op^Q.
> +;; SMAX/SMIN/SABD/SMAXP/SMINP: ASIMD arithmetic op^Q.
> +;; UMAX/UMIN/UABD/UMAXP/UMINP: ASIMD arithmetic op^Q.
> +;; USQADD/ABS/NEG: ASIMD arithmetic op.
> +;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
> +;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
> +;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
> +
> +;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
> +;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
> +;; SADDLP/SUQADD/SQABS/SQNEG: ASIMD arithmetic op^Q.
> +;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
> +
> +;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op.
> +;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op.
> +;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op^Q.
> +;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op^Q.
> +(define_insn_reservation "neon_arith" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_add,\
> +                        neon_add_q,\
> +                        neon_sub,\
> +                        neon_sub_q,\
> +                        neon_neg,\
> +                        neon_neg_q,\
> +                        neon_abs,\
> +                        neon_abs_q,\
> +                        neon_abd_q,\
> +                        neon_arith_acc,\
> +                        neon_arith_acc_q,\
> +                        neon_reduc_add,\
> +                        neon_reduc_add_q,\
> +                        neon_add_halve,\
> +                        neon_add_halve_q,\
> +                        neon_sub_halve,\
> +                        neon_sub_halve_q,\
> +                        neon_qadd,\
> +                        neon_qadd_q,\
> +                        neon_compare,\
> +                        neon_compare_q,\
> +                        neon_compare_zero,\
> +                        neon_compare_zero_q,\
> +                        neon_tst,\
> +                        neon_tst_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; SABA/UABA: (ASIMD arithmetic op + ASIMD arithmetic op)^Q.
> +;; 3*3
> +(define_insn_reservation "neon_abs_diff" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
> +  "decode2op,nothing*5")
> +
> +;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
> +;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
> +;; SMLAL/SMLSL/SMULL/SQDMLAL/SQDMLSL/SQDMULL: ASIMD multiply op*2.
> +;; SMULL/SMLAL/SMLSL (by element): ASIMD multiply op*2.
> +;; UMULL/UMLAL/UMLSL (by element): ASIMD multiply op*2.
> +;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op.
> +;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op^Q.
> +
> +;; SQDMULH/SQRDMULH: ASIMD multiply op.
> +;; SQDMULH/SQRDMULH (by element): ASIMD multiply op.
> +;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
> +;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
> +
> +;; SQDMULL/SQDMLAL/SQDMLSL (by element): ASIMD multiply op*2.
> +;; SQDMULL/SQDMLAL/SQDMLSL: ASIMD multiply op.
> +(define_insn_reservation "neon_mul" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_b,\
> +                        neon_mul_b_q,\
> +                        neon_mul_h,\
> +                        neon_mul_h_q,\
> +                        neon_mul_s,\
> +                        neon_mul_s_q,\
> +                        neon_fp_mul_s_scalar,\
> +                        neon_fp_mul_s_scalar_q,\
> +                        neon_fp_mul_d_scalar_q,\
> +                        neon_mla_b,neon_mla_b_q,\
> +                        neon_mla_h,neon_mla_h_q,\
> +                        neon_mla_s,neon_mla_s_q,\
> +                        neon_mla_h_scalar,\
> +                        neon_mla_h_scalar_q,\
> +                        neon_mla_s_scalar,\
> +                        neon_mla_s_scalar_q,\
> +                        neon_mla_b_long,\
> +                        neon_mla_h_long,\
> +                        neon_mla_s_long,\
> +                        neon_fp_mul_s,\
> +                        neon_fp_mul_s_q,\
> +                        neon_fp_mul_d,\
> +                        neon_fp_mul_d_q,\
> +                        neon_fp_mla_s,\
> +                        neon_fp_mla_s_q,\
> +                        neon_fp_mla_d,\
> +                        neon_fp_mla_d_q,\
> +                        neon_fp_mla_s_scalar,\
> +                        neon_fp_mla_s_scalar_q,\
> +                        neon_fp_mla_d_scalar_q,\
> +                        neon_sat_mul_b,\
> +                        neon_sat_mul_b_q,\
> +                        neon_sat_mul_h,\
> +                        neon_sat_mul_h_q,\
> +                        neon_sat_mul_s,\
> +                        neon_sat_mul_s_q,\
> +                        neon_sat_mul_h_scalar,\
> +                        neon_sat_mul_h_scalar_q,\
> +                        neon_sat_mul_s_scalar,\
> +                        neon_sat_mul_s_scalar_q,\
> +                        neon_sat_mul_h_scalar_long,\
> +                        neon_sat_mul_s_scalar_long,\
> +                        neon_sat_mla_b_long,\
> +                        neon_sat_mla_h_long,\
> +                        neon_sat_mla_s_long,\
> +                        neon_sat_mla_h_scalar_long,\
> +                        neon_sat_mla_s_scalar_long,\
> +                       "))
> +  "decode2op,nothing*4")
> +
> +;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
> +;; FABD: FP arithmetic op^Q.
> +(define_insn_reservation "fp_abd_diff" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_abd_s,\
> +                        neon_fp_abd_s_q,\
> +                        neon_fp_abd_d,\
> +                        neon_fp_abd_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; See #3565
> +;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
> +(define_insn_reservation "neon_f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_addsub_s,\
> +                        neon_fp_addsub_s_q,\
> +                        neon_fp_addsub_d,\
> +                        neon_fp_addsub_d_q,\
> +                       "))
> +  "decode1op,nothing*5")
> +
> +;; FDIV: FP divide op^Q.
> +(define_insn_reservation "neon_f_div" 28
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_div_s,\
> +                        neon_fp_div_s_q,\
> +                        neon_fp_div_d,\
> +                        neon_fp_div_d_q,\
> +                       "))
> +  "decode1op,fp_divide*27")
> +
> +;; FABS/FNEG: FP move op^Q.
> +(define_insn_reservation "neon_f_neg" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_neg_s,\
> +                        neon_fp_neg_s_q,\
> +                        neon_fp_neg_d,\
> +                        neon_fp_neg_d_q,\
> +                        neon_fp_abs_s,\
> +                        neon_fp_abs_s_q,\
> +                        neon_fp_abs_d,\
> +                        neon_fp_abs_d_q,\
> +                       "))
> +  "decode1op,nothing")
> +
> +;; FRINTN/FRINTM/FRINTA/FRINTP/FRINTZ/FRINTX/FRINTI: FP convert op^Q.
> +(define_insn_reservation "neon_f_round" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_round_s,\
> +                        neon_fp_round_s_q,\
> +                        neon_fp_round_d,\
> +                        neon_fp_round_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; FCVTNS/FCVTMS/FCVTAS/FCVTPS: FP convert op^Q.
> +;; FCVTNU/FCVTMU/FCVTAU/FCVTPU: FP convert op^Q.
> +;; FCVTZS/FCVTZU (integer): FP convert op^Q.
> +;; FCVTN/FCVTL (size=0): FP half cvt op.
> +(define_insn_reservation "neon_f_cvt" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type"  "neon_int_to_fp_s,\
> +                         neon_int_to_fp_s_q,\
> +                         neon_int_to_fp_d,\
> +                         neon_int_to_fp_d_q,\
> +                         neon_fp_cvt_widen_s,\
> +                         neon_fp_cvt_narrow_s_q,\
> +                         neon_fp_cvt_narrow_d_q,\
> +                        "))
> +  "decode1op,nothing*4")
> +
> +;; FADD/FSUB/FMULX/FMLA/FMLS/FADDP: FP arithmetic op^Q.
> +(define_insn_reservation "neon_f_reduc" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_reduc_add_s,\
> +                        neon_fp_reduc_add_s_q,\
> +                        neon_fp_reduc_add_d,\
> +                        neon_fp_reduc_add_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; CLS/CLZ/CNT/NOT/RBIT: ASIMD logical op^Q.
> +;; PMUL: ASIMD logical op^Q.
> +(define_insn_reservation "neon_cls" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_cls,neon_cls_q"))
> +  "decode1op,nothing")
> +
> +;; ST1 (one register, 1D): FP store op.
> +(define_insn_reservation "neon_st1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_one_lane,\
> +                        neon_store1_one_lane_q,\
> +                       "))
> +  "decode1op,nothing*3")
> +
> +;; ADDHN/SUBHN/RADDHN/RSUBHN: ASIMD arithmetic op*2 + ASIMD shift op.
> +;; 3 + 3
> +(define_insn_reservation "neon_halve_narrow" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_sub_halve_narrow_q,\
> +                        neon_add_halve_narrow_q,\
> +                       "))
> +  "decodeIsolated,nothing*5")
> +
> +;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op).
> +;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op).
> +;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
> +;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
> +;; 3 + 3
> +(define_insn_reservation "neon_shift_acc" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_acc,\
> +                        neon_shift_acc_q,\
> +                       "))
> +  "decode2op,nothing*5")
> +
> +;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op.
> +;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op.
> +;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op^Q.
> +;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op^Q.
> +(define_insn_reservation "neon_fp_compare" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_compare_s,\
> +                        neon_fp_compare_s_q,\
> +                        neon_fp_compare_d,\
> +                        neon_fp_compare_d_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; FSQRT: FP sqrt op.
> +(define_insn_reservation "neon_fp_sqrt" 38
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_sqrt_s,\
> +                        neon_fp_sqrt_s_q,\
> +                        neon_fp_sqrt_d,\
> +                        neon_fp_sqrt_d_q,\
> +                       "))
> +  "decode1op,fp_divide*37")

Similar concern to the integer divide comment above.

> +
> +;; See #3566
> +;; TBL/TBX (single register table): (ASIMD logical op + ASIMD logical op)^Q.
> +;; 2 + 2
> +(define_insn_reservation "neon_tbl1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl1,\
> +                        neon_tbl1_q,\
> +                       "))
> +  "decode2op,nothing*2")
> +
> +;; TBL/TBX (two register table): (ASIMD logical op + ASIMD logical op + ASIMD logical op + ASIMD logical op)^Q.
> +;; 2 + 2 + 2 + 2
> +(define_insn_reservation "neon_tbl2" 8
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl2,\
> +                        neon_tbl2_q,\
> +                       "))
> +  "decodeIsolated,nothing*7")
> +
> +;; See #3565
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=0): ASIMD shift op.
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=11): ASIMD logical op*2.
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=other): ASIMD shift op*2.
> +;; TRN1/TRN2 (size=11): ASIMD logical op*2.
> +;; TRN1/TRN2 (size=other): ASIMD shift op^Q.
> +(define_insn_reservation "neon_permute" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_permute,\
> +                        neon_permute_q,\
> +                       "))
> +  "decode2op,nothing*2")
> +
> +;; LD1R: FP load op.
> +(define_insn_reservation "neon_ld1r" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_all_lanes,\
> +                       "))
> +  "decode1op,nothing*9")
> +
> +;; FRECPE/FRECPX: ASIMD dre op.
> +;; FRECPE/FRECPX: ASIMD dre op.
> +(define_insn_reservation "neon_fp_recp" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recpe_s,\
> +                        neon_fp_recpe_s_q,\
> +                        neon_fp_recpe_d,\
> +                        neon_fp_recpe_d_q,\
> +                        neon_fp_recpx_s,\
> +                        neon_fp_recpx_s_q,\
> +                        neon_fp_recpx_d,\
> +                        neon_fp_recpx_d_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +
> +;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
> +;; FRECPS/FRSQRTS: FP arithmetic op^Q.
> +(define_insn_reservation "neon_fp_recp_s" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recps_s,\
> +                        neon_fp_recps_s_q,\
> +                        neon_fp_recps_d,\
> +                        neon_fp_recps_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; See #3566
> +;; PMULL: ASIMD polymul op*2.
> +(define_insn_reservation "neon_pmull" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_d_long,\
> +                       "))
> +  "decode2op,nothing*4")
> --
> 1.9.1
>
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
  2014-11-19 18:08   ` Kyrill Tkachov
@ 2014-11-19 18:11   ` Andrew Pinski
  2014-11-19 18:11   ` Kyrill Tkachov
  2 siblings, 0 replies; 14+ messages in thread
From: Andrew Pinski @ 2014-11-19 18:11 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: GCC Patches, Marcus Shawcroft, benedikt.huber, ksankaran

A few other comments.

On Wed, Nov 19, 2014 at 9:32 AM, Philipp Tomsich
<philipp.tomsich@theobroma-systems.com> wrote:
> ---
>  gcc/ChangeLog                 |   6 +
>  gcc/config/aarch64/aarch64.md |   4 +-
>  gcc/config/arm/xgene1.md      | 739 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 748 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/config/arm/xgene1.md
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 5b389c5..9cc3b5a 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,11 @@
>  2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>
> +       * config/aarch64/aarch64.md: Include xgene1.md.
> +       (generic_sched): Set to no for xgene1.
> +       * config/arm/xgene1.md: New file.
> +
> +2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
>         * config/aarch64/aarch64-cores.def (xgene1): Update/add the
>         xgene1 (APM XGene-1) core definition.
>         * gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 597ff8c..5d92051 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -191,7 +191,7 @@
>
>  (define_attr "generic_sched" "yes,no"
>    (const (if_then_else
> -          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
> +          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
>            (const_string "no")
>            (const_string "yes"))))
>
> @@ -4211,3 +4211,5 @@
>
>  ;; Atomic Operations
>  (include "atomics.md")
> +
> +(include "../arm/xgene1.md")
> diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
> new file mode 100644
> index 0000000..3c08b16
> --- /dev/null
> +++ b/gcc/config/arm/xgene1.md
> @@ -0,0 +1,739 @@
> +;; Machine description for AppliedMicro xgene1 core.
> +;; Copyright (C) 2012 Free Software Foundation, Inc.
> +;; Contributed by Theobroma Systems Design und Consulting GmbH.
> +;;                See http://www.theobroma-systems.com for more info.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; Pipeline description for the xgene1 micro-architecture
> +
> +(define_automaton "xgene1")
> +
> +(define_cpu_unit "decode_out_0" "xgene1")
> +(define_cpu_unit "decode_out_1" "xgene1")
> +(define_cpu_unit "decode_out_2" "xgene1")
> +(define_cpu_unit "decode_out_3" "xgene1")
> +
> +(define_cpu_unit "divide" "xgene1")
> +(define_cpu_unit "fp_divide" "xgene1")

These should all have xgene1 in their names: the units are specific to
this core, not generic, and if we later decide on a generic scheduler
we might want to use the more generic names there.

Thanks,
Andrew Pinski

> +
> +(define_reservation "decode1op"
> +        "( decode_out_0 )
> +        |( decode_out_1 )
> +        |( decode_out_2 )
> +        |( decode_out_3 )"
> +)
> +(define_reservation "decode2op"
> +        "( decode_out_0 + decode_out_1 )
> +        |( decode_out_0 + decode_out_2 )
> +        |( decode_out_0 + decode_out_3 )
> +        |( decode_out_1 + decode_out_2 )
> +        |( decode_out_1 + decode_out_3 )
> +        |( decode_out_2 + decode_out_3 )"
> +)
> +(define_reservation "decodeIsolated"
> +        "( decode_out_0 + decode_out_1 + decode_out_2 + decode_out_3 )"
> +)
> +
> +;; (define_insn_reservation "dummy" 1
> +;;   (and (eq_attr "tune" "xgene1")
> +;;        (eq_attr "type" "neon_minmax"))
> +;;   "decodeIsolated")
> +
> +;; B: nop.
> +;; BR: branch op.
> +
> +;; RET
> +;; CBZ
> +;; TBZ
> +(define_insn_reservation "branch" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "branch"))
> +  "decode1op")
> +
> +;; NOP
> +;; HINT
> +(define_insn_reservation "nop" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "no_insn"))
> +  "decode1op")
> +
> +;; See #3565
> +;; BLR: arithmetic op & branch op.
> +;; BL: arithmetic op.
> +(define_insn_reservation "call" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "call"))
> +  "decode2op")
> +
> +;; LDR: FP load op & arithmetic op.
> +(define_insn_reservation "f_load" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_loadd,f_loads"))
> +  "decode2op,nothing*9")
> +
> +;; STR: FP store op & arithmetic op.
> +(define_insn_reservation "f_store" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_stored,f_stores"))
> +  "decode2op,nothing*3")
> +
> +;; FMOV (immediate): FP move op.
> +;; FMOV (register): FP move op.
> +(define_insn_reservation "fmov" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fmov,fconsts,fconstd"))
> +  "decode1op,nothing")
> +
> +;; LDP: FP load op & FP load op.
> +;; LDP: FP load op & FP load op & arithmetic op.
> +(define_insn_reservation "f_mcr" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mcr"))
> +  "decodeIsolated,nothing*9")
> +
> +;; STP: FP store op & FP store op.
> +(define_insn_reservation "f_mrc" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mrc"))
> +  "decode2op,nothing*3")
> +
> +;; The (register offset) instructions with a shift
> +;; of #0, #2, or #3 (or no shift) are translated
> +;; as shown.
> +;; For these instructions, any other shift amount
> +;; causes the instruction to be prefixed with an
> +;; sbfm/ubfm op (1 cycle latency).
> +
> +;; Load/store register pair (post-indexed):
> +;; LDP: load op & load op & arithmetic op.
> +;; Load/store register pair (offset):
> +;; LDP: load op & load op.
> +;; Load/store register pair (pre-indexed):
> +;; LDP: load op & load op & arithmetic op.
> +;; 5 + 1
> +(define_insn_reservation "load_pair" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load2"))
> +  "decodeIsolated,nothing*5")
> +
> +;; Load/store register pair (post-indexed):
> +;; STP: store op & store op & arithmetic op.
> +;; Load/store register pair (offset):
> +;; STP: store op & store op.
> +;; Load/store register pair (pre-indexed):
> +;; STP: store op & store op & arithmetic op.
> +;; 1 + 1
> +(define_insn_reservation "store_pair" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store2"))
> +  "decodeIsolated,nothing")
> +
> +;; Load register (literal):
> +;; LDR: load op.
> +;; Load/store register (immediate post-indexed):
> +;; LDRB/LDRH/LDR: load op & arithmetic op.
> +;; Load/store register (immediate pre-indexed):
> +;; LDRB/LDRH/LDR: load op & arithmetic op.
> +;; Load/store register (register offset)
> +;; LDRB/LDRH/LDR: load op.
> +;; LDRSB/LDRSH/LDRSW: load op + sbfm op (1 cycle latency).
> +;; Load/store register (unsigned immediate):
> +;; LDRB/LDRH/LDR: load op.
> +;; 5 + 1
> +;; FIXME This is inaccurate but avoids a crash.
> +(define_insn_reservation "load1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load1"))
> +  "decode2op,nothing")
> +
> +;; Load/store register (immediate post-indexed):
> +;; STRB/STRH/STR: store op & arithmetic op.
> +;; Load/store register (immediate pre-indexed):
> +;; STRB/STRH/STR: store op & arithmetic op.
> +;; Load/store register (register offset)
> +;; STRB/STRH/STR: store op.
> +;; Load/store register (unsigned immediate):
> +;; STRB/STRH/STR: store op.
> +;; 1 + 1
> +(define_insn_reservation "store1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store1"))
> +  "decode2op,nothing")
> +
> +;; MOVI
> +;; MOV
> +;; Move wide: logical op.
> +;; MRS NZCV: logical op (register result).
> +(define_insn_reservation "move" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mov_reg,mov_imm,mrs"))
> +  "decode1op")
> +
> +;; See #3565
> +(define_insn_reservation "alu" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
> +                        alu_ext,adc_reg,csel,logic_imm,\
> +                        logic_reg,logic_shift_imm,clz,\
> +                        rbit,shift_reg,adr,mov_reg,\
> +                        mov_imm,extend"))
> +  "decode1op")
> +
> +;; REV/REV16/REV32: SIMD op.
> +(define_insn_reservation "simd" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "rev"))
> +  "decode1op")
> +
> +;; See #3565
> +(define_insn_reservation "alus" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
> +                        alus_ext,logics_imm,logics_reg,\
> +                        logics_shift_imm"))
> +  "decode1op")
> +
> +;; MADD/SMADDL/UMADDL with Ra=XZR/WZR: multiply op.
> +;; MADD/SMADDL/UMADDL with other Ra: multiply op + arithmetic op.
> +;; MSUB/SMSUBL/UMSUBL: multiply op + arithmetic op.
> +;; SMULH/UMULH: multiply op.
> +;; 5 + 1
> +(define_insn_reservation "mul" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
> +  "decode2op,nothing*5")
> +
> +;; UDIV/SDIV: divide op.
> +(define_insn_reservation "div" 66
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "sdiv,udiv"))
> +  "decode1op,divide*65")
> +
> +;; FCMP/FCMPE: FP compare op.
> +(define_insn_reservation "fcmp" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcmpd,fcmps"))
> +  "decode1op,nothing*11")
> +
> +;; FCSEL: FP select op
> +(define_insn_reservation "fcsel" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcsel"))
> +  "decode1op,nothing*2")
> +
> +;; See #3565
> +(define_insn_reservation "bfm" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "bfm"))
> +  "decode1op,nothing")
> +
> +;; FRINTN/FRINTP/FRINTM/FRINTZ/FRINTA/FRINTX/FRINTI:
> +;; FP convert op
> +(define_insn_reservation "f_rint" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_rintd,f_rints"))
> +  "decode1op,nothing*4")
> +
> +;; FCVT (single to double or double to single): FP arithmetic op.
> +;; FCVT (to or from half precision): FP half cvt op.
> +(define_insn_reservation "f_cvt" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvt"))
> +  "decode1op,nothing*2")
> +
> +;; Floating-point<->integer conversions:
> +;; FCVTNS/FCVTAS/FCVTPS/FCVTMS: FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; FCVTNU/FCVTAU/FCVTPU/FCVTMU: FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; FCVTZS/FCVTZU (integer): FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; Floating-point<->fixed-point conversions:
> +;; FCVTZS/FCVTZU (fixed-point): FP convert op + FP store op (data bypass path) + (integer) load op.
> +;; 5 + 1 + 5
> +(define_insn_reservation "f_cvtf2i" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvtf2i"))
> +  "decodeIsolated,nothing*10")
> +
> +;; SCVTF/UCVTF (integer): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
> +;; SCVTF/UCVTF (fixed-point): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
> +;; -1 + 10 + 5
> +(define_insn_reservation "f_cvti2f" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvti2f"))
> +  "decodeIsolated,nothing*13")
> +
> +;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
> +(define_insn_reservation "f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
> +  "decode1op,nothing*5")
> +
> +;; FDIV: FP divide op.
> +(define_insn_reservation "f_div" 28
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fdivd,fdivs"))
> +  "decode1op,fp_divide*27")
> +
> +;; FABS/FNEG: FP move op.
> +(define_insn_reservation "f_arith" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "ffarithd,ffariths"))
> +  "decode1op,nothing")
> +
> +;; FSQRT: FP sqrt op.
> +(define_insn_reservation "f_sqrt" 38
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fsqrtd,fsqrts"))
> +  "decode1op,fp_divide*37")
> +
> +;; FMAX/FMIN/FMAXNM/FMINNM: FP select op.
> +(define_insn_reservation "f_select" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_minmaxd,f_minmaxs"))
> +  "decode1op,nothing*2")
> +
> +
> +;; SIMD (aka neon)
> +
> +;; DUP (element) (size=x1000): ASIMD logical op.
> +;; DUP (element) (size=other): ASIMD shift op.
> +(define_insn_reservation "neon_dup" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_dup,neon_dup_q"))
> +  "decode1op,nothing*2")
> +
> +;; LDR: FP load op & arithmetic op.
> +;; LD1 (one register, 1D): FP load op.
> +;; LD1 (one register, 2D): FP load op*2.
> +;; LD1 (one register, 2S/4H/8B): FP complex load op.
> +;; LD1 (one register, 4S/8H/16B): FP complex load op*2.
> +(define_insn_reservation "neon_load1" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
> +  "decode2op,nothing*10")
> +
> +;; STR: FP store op & arithmetic op.
> +;; ST1 (one register, 1D): FP store op
> +;; ST1 (one register, 2D): FP store op*2
> +;; ST1 (one register, 2S/4H/8B): FP complex store op
> +;; ST1 (one register, 4S/8H/16B): FP complex store op*2
> +;; 4 + 1
> +(define_insn_reservation "neon_store1" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
> +  "decode2op,nothing*4")
> +
> +;; MOVI/MVNI/ORR/BIC/FMOV: ASIMD logical op^Q.
> +;; AND/BIC/ORR/ORN/EOR/BSL/BIT/BIF: ASIMD logical op^Q.
> +(define_insn_reservation "neon_logic" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_logic,\
> +                        neon_logic_q,\
> +                        neon_bsl,\
> +                        neon_bsl_q,\
> +                        neon_move,\
> +                        neon_move_q,\
> +                       "))
> +  "decode1op,nothing")
> +;; N.B. ^Q means that it only uses one decode slot.
> +
> +;; UMOV (imm5=xxx00): FP store op (data bypass path) + (integer) load op.
> +;; UMOV (imm5=other): FP store op (data bypass path) + (integer) load op + ubfm op (1 cycle latency).
> +;; 1 + 5 + 1
> +(define_insn_reservation "neon_umov" 7
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
> +  "decodeIsolated,nothing*6")
> +
> +;; INS (element) (imm5=01000): FP move op.
> +;; INS (element) (imm5=other): ASIMD shift op + ASIMD insert op.
> +;; INS (general register) (imm5=01000): (integer) store op + FP load op.
> +;; INS (general register) (imm5=other): (integer) store op + FP load op + ASIMD insert op.
> +;; 1 + 10 + 3
> +(define_insn_reservation "neon_ins" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_from_gp,\
> +                        neon_from_gp_q,\
> +                        neon_ins,\
> +                        neon_ins_q,\
> +                       "))
> +  "decodeIsolated,nothing*13")
> +
> +;; USHR/URSHR: ASIMD shift op.
> +;; USHR/URSHR: ASIMD shift op^Q.
> +;; SSHR/SRSHR: ASIMD shift op.
> +;; SSHR/SRSHR: ASIMD shift op^Q.
> +;; SHL/SQSHL/SQSHLU: ASIMD shift op.
> +;; SHL/SQSHL/SQSHLU: ASIMD shift op^Q.
> +;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op.
> +;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op^Q.
> +;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op.
> +;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op^Q.
> +;; XTN/SQXTN/UQXTN/SQXTUN/SHLL: ASIMD shift op.
> +;; SSHLL/USHLL: ASIMD shift op*2.
> +(define_insn_reservation "neon_shift" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_imm,\
> +                        neon_shift_imm_q,\
> +                        neon_shift_reg,\
> +                        neon_shift_reg_q,\
> +                        neon_shift_imm_long,\
> +                        neon_sat_shift_imm,\
> +                        neon_sat_shift_imm_q,\
> +                        neon_sat_shift_imm_narrow_q,\
> +                        neon_sat_shift_reg,\
> +                        neon_sat_shift_reg_q,\
> +                        neon_shift_imm_narrow_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; ADD/SUB: ASIMD arithmetic op.
> +;; ADD/SUB/ADDP: ASIMD arithmetic op^Q.
> +;; SMAX/SMIN/SABD/SMAXP/SMINP: ASIMD arithmetic op^Q.
> +;; UMAX/UMIN/UABD/UMAXP/UMINP: ASIMD arithmetic op^Q.
> +;; USQADD/ABS/NEG: ASIMD arithmetic op.
> +;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
> +;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
> +;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
> +
> +;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
> +;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
> +;; SADDLP/SUQADD/SQABS/SQNEG: ASIMD arithmetic op^Q.
> +;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
> +
> +;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op.
> +;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op.
> +;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op^Q.
> +;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op^Q.
> +(define_insn_reservation "neon_arith" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_add,\
> +                        neon_add_q,\
> +                        neon_sub,\
> +                        neon_sub_q,\
> +                        neon_neg,\
> +                        neon_neg_q,\
> +                        neon_abs,\
> +                        neon_abs_q,\
> +                        neon_abd_q,\
> +                        neon_arith_acc,\
> +                        neon_arith_acc_q,\
> +                        neon_reduc_add,\
> +                        neon_reduc_add_q,\
> +                        neon_add_halve,\
> +                        neon_add_halve_q,\
> +                        neon_sub_halve,\
> +                        neon_sub_halve_q,\
> +                        neon_qadd,\
> +                        neon_qadd_q,\
> +                        neon_compare,\
> +                        neon_compare_q,\
> +                        neon_compare_zero,\
> +                        neon_compare_zero_q,\
> +                        neon_tst,\
> +                        neon_tst_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; SABA/UABA: (ASIMD arithmetic op + ASIMD arithmetic op)^Q.
> +;; 3*3
> +(define_insn_reservation "neon_abs_diff" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
> +  "decode2op,nothing*5")
> +
> +;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
> +;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
> +;; SMLAL/SMLSL/SMULL/SQDMLAL/SQDMLSL/SQDMULL: ASIMD multiply op*2.
> +;; SMULL/SMLAL/SMLSL (by element): ASIMD multiply op*2.
> +;; UMULL/UMLAL/UMLSL (by element): ASIMD multiply op*2.
> +;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op.
> +;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op^Q.
> +
> +;; SQDMULH/SQRDMULH: ASIMD multiply op.
> +;; SQDMULH/SQRDMULH (by element): ASIMD multiply op.
> +;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
> +;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
> +
> +;; SQDMULL/SQDMLAL/SQDMLSL (by element): ASIMD multiply op*2.
> +;; SQDMULL/SQDMLAL/SQDMLSL: ASIMD multiply op.
> +(define_insn_reservation "neon_mul" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_b,\
> +                        neon_mul_b_q,\
> +                        neon_mul_h,\
> +                        neon_mul_h_q,\
> +                        neon_mul_s,\
> +                        neon_mul_s_q,\
> +                        neon_fp_mul_s_scalar,\
> +                        neon_fp_mul_s_scalar_q,\
> +                        neon_fp_mul_d_scalar_q,\
> +                        neon_mla_b,neon_mla_b_q,\
> +                        neon_mla_h,neon_mla_h_q,\
> +                        neon_mla_s,neon_mla_s_q,\
> +                        neon_mla_h_scalar,\
> +                        neon_mla_h_scalar_q,\
> +                        neon_mla_s_scalar,\
> +                        neon_mla_s_scalar_q,\
> +                        neon_mla_b_long,\
> +                        neon_mla_h_long,\
> +                        neon_mla_s_long,\
> +                        neon_fp_mul_s,\
> +                        neon_fp_mul_s_q,\
> +                        neon_fp_mul_d,\
> +                        neon_fp_mul_d_q,\
> +                        neon_fp_mla_s,\
> +                        neon_fp_mla_s_q,\
> +                        neon_fp_mla_d,\
> +                        neon_fp_mla_d_q,\
> +                        neon_fp_mla_s_scalar,\
> +                        neon_fp_mla_s_scalar_q,\
> +                        neon_fp_mla_d_scalar_q,\
> +                        neon_sat_mul_b,\
> +                        neon_sat_mul_b_q,\
> +                        neon_sat_mul_h,\
> +                        neon_sat_mul_h_q,\
> +                        neon_sat_mul_s,\
> +                        neon_sat_mul_s_q,\
> +                        neon_sat_mul_h_scalar,\
> +                        neon_sat_mul_h_scalar_q,\
> +                        neon_sat_mul_s_scalar,\
> +                        neon_sat_mul_s_scalar_q,\
> +                        neon_sat_mul_h_scalar_long,\
> +                        neon_sat_mul_s_scalar_long,\
> +                        neon_sat_mla_b_long,\
> +                        neon_sat_mla_h_long,\
> +                        neon_sat_mla_s_long,\
> +                        neon_sat_mla_h_scalar_long,\
> +                        neon_sat_mla_s_scalar_long,\
> +                       "))
> +  "decode2op,nothing*4")
> +
> +;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
> +;; FABD: FP arithmetic op^Q.
> +(define_insn_reservation "fp_abd_diff" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_abd_s,\
> +                        neon_fp_abd_s_q,\
> +                        neon_fp_abd_d,\
> +                        neon_fp_abd_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; See #3565
> +;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
> +(define_insn_reservation "neon_f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_addsub_s,\
> +                        neon_fp_addsub_s_q,\
> +                        neon_fp_addsub_d,\
> +                        neon_fp_addsub_d_q,\
> +                       "))
> +  "decode1op,nothing*5")
> +
> +;; FDIV: FP divide op^Q.
> +(define_insn_reservation "neon_f_div" 28
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_div_s,\
> +                        neon_fp_div_s_q,\
> +                        neon_fp_div_d,\
> +                        neon_fp_div_d_q,\
> +                       "))
> +  "decode1op,fp_divide*27")
> +
> +;; FABS/FNEG: FP move op^Q.
> +(define_insn_reservation "neon_f_neg" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_neg_s,\
> +                        neon_fp_neg_s_q,\
> +                        neon_fp_neg_d,\
> +                        neon_fp_neg_d_q,\
> +                        neon_fp_abs_s,\
> +                        neon_fp_abs_s_q,\
> +                        neon_fp_abs_d,\
> +                        neon_fp_abs_d_q,\
> +                       "))
> +  "decode1op,nothing")
> +
> +;; FRINTN/FRINTM/FRINTA/FRINTP/FRINTZ/FRINTX/FRINTI: FP convert op^Q.
> +(define_insn_reservation "neon_f_round" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_round_s,\
> +                        neon_fp_round_s_q,\
> +                        neon_fp_round_d,\
> +                        neon_fp_round_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; FCVTNS/FCVTMS/FCVTAS/FCVTPS: FP convert op^Q.
> +;; FCVTNU/FCVTMU/FCVTAU/FCVTPU: FP convert op^Q.
> +;; FCVTZS/FCVTZU (integer): FP convert op^Q.
> +;; FCVTN/FCVTL (size=0): FP half cvt op.
> +(define_insn_reservation "neon_f_cvt" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type"  "neon_int_to_fp_s,\
> +                         neon_int_to_fp_s_q,\
> +                         neon_int_to_fp_d,\
> +                         neon_int_to_fp_d_q,\
> +                         neon_fp_cvt_widen_s,\
> +                         neon_fp_cvt_narrow_s_q,\
> +                         neon_fp_cvt_narrow_d_q,\
> +                        "))
> +  "decode1op,nothing*4")
> +
> +;; FADD/FSUB/FMULX/FMLA/FMLS/FADDP: FP arithmetic op^Q.
> +(define_insn_reservation "neon_f_reduc" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_reduc_add_s,\
> +                        neon_fp_reduc_add_s_q,\
> +                        neon_fp_reduc_add_d,\
> +                        neon_fp_reduc_add_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; CLS/CLZ/CNT/NOT/RBIT: ASIMD logical op^Q.
> +;; PMUL: ASIMD logical op^Q.
> +(define_insn_reservation "neon_cls" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_cls,neon_cls_q"))
> +  "decode1op,nothing")
> +
> +;; ST1 (one register, 1D): FP store op.
> +(define_insn_reservation "neon_st1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_one_lane,\
> +                        neon_store1_one_lane_q,\
> +                       "))
> +  "decode1op,nothing*3")
> +
> +;; ADDHN/SUBHN/RADDHN/RSUBHN: ASIMD arithmetic op*2 + ASIMD shift op.
> +;; 3 + 3
> +(define_insn_reservation "neon_halve_narrow" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_sub_halve_narrow_q,\
> +                        neon_add_halve_narrow_q,\
> +                       "))
> +  "decodeIsolated,nothing*5")
> +
> +;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op).
> +;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op).
> +;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
> +;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
> +;; 3 + 3
> +(define_insn_reservation "neon_shift_acc" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_acc,\
> +                        neon_shift_acc_q,\
> +                       "))
> +  "decode2op,nothing*5")
> +
> +;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op.
> +;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op.
> +;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op^Q.
> +;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op^Q.
> +(define_insn_reservation "neon_fp_compare" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_compare_s,\
> +                        neon_fp_compare_s_q,\
> +                        neon_fp_compare_d,\
> +                        neon_fp_compare_d_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +;; FSQRT: FP sqrt op.
> +(define_insn_reservation "neon_fp_sqrt" 38
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_sqrt_s,\
> +                        neon_fp_sqrt_s_q,\
> +                        neon_fp_sqrt_d,\
> +                        neon_fp_sqrt_d_q,\
> +                       "))
> +  "decode1op,fp_divide*37")
> +
> +;; See #3566
> +;; TBL/TBX (single register table): (ASIMD logical op + ASIMD logical op)^Q.
> +;; 2 + 2
> +(define_insn_reservation "neon_tbl1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl1,\
> +                        neon_tbl1_q,\
> +                       "))
> +  "decode2op,nothing*2")
> +
> +;; TBL/TBX (two register table): (ASIMD logical op + ASIMD logical op + ASIMD logical op + ASIMD logical op)^Q.
> +;; 2 + 2 + 2 + 2
> +(define_insn_reservation "neon_tbl2" 8
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl2,\
> +                        neon_tbl2_q,\
> +                       "))
> +  "decodeIsolated,nothing*7")
> +
> +;; See #3565
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=0): ASIMD shift op.
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=11): ASIMD logical op*2.
> +;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=other): ASIMD shift op*2.
> +;; TRN1/TRN2 (size=11): ASIMD logical op*2.
> +;; TRN1/TRN2 (size=other): ASIMD shift op^Q.
> +(define_insn_reservation "neon_permute" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_permute,\
> +                        neon_permute_q,\
> +                       "))
> +  "decode2op,nothing*2")
> +
> +;; LD1R: FP load op.
> +(define_insn_reservation "neon_ld1r" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_all_lanes,\
> +                       "))
> +  "decode1op,nothing*9")
> +
> +;; FRECPE/FRECPX: ASIMD dre op.
> +;; FRECPE/FRECPX: ASIMD dre op.
> +(define_insn_reservation "neon_fp_recp" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recpe_s,\
> +                        neon_fp_recpe_s_q,\
> +                        neon_fp_recpe_d,\
> +                        neon_fp_recpe_d_q,\
> +                        neon_fp_recpx_s,\
> +                        neon_fp_recpx_s_q,\
> +                        neon_fp_recpx_d,\
> +                        neon_fp_recpx_d_q,\
> +                       "))
> +  "decode1op,nothing*2")
> +
> +
> +;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
> +;; FRECPS/FRSQRTS: FP arithmetic op^Q.
> +(define_insn_reservation "neon_fp_recp_s" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recps_s,\
> +                        neon_fp_recps_s_q,\
> +                        neon_fp_recps_d,\
> +                        neon_fp_recps_d_q,\
> +                       "))
> +  "decode1op,nothing*4")
> +
> +;; See #3566
> +;; PMULL: ASIMD polymul op*2.
> +(define_insn_reservation "neon_pmull" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_d_long,\
> +                       "))
> +  "decode2op,nothing*4")
> --
> 1.9.1
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
  2014-11-19 18:08   ` Kyrill Tkachov
  2014-11-19 18:11   ` Andrew Pinski
@ 2014-11-19 18:11   ` Kyrill Tkachov
  2014-11-19 19:45     ` Dr. Philipp Tomsich
  2014-11-19 21:58     ` [PATCH 2/2, AArch64, v2] " Philipp Tomsich
  2 siblings, 2 replies; 14+ messages in thread
From: Kyrill Tkachov @ 2014-11-19 18:11 UTC (permalink / raw)
  To: Philipp Tomsich, gcc-patches; +Cc: marcus.shawcroft, benedikt.huber, ksankaran

Hi Philipp,

One more comment...

On 19/11/14 17:32, Philipp Tomsich wrote:
> @@ -4211,3 +4211,5 @@
>
>   ;; Atomic Operations
>   (include "atomics.md")
> +
> +(include "../arm/xgene1.md")

Do you expect to add arm support for this core? If so, you should wire
it up there as well.
If not, I think Andrew set a precedent for aarch64-only core
descriptions going in the config/aarch64/ directory.

Cheers,
Kyrill

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  2014-11-19 18:11   ` Kyrill Tkachov
@ 2014-11-19 19:45     ` Dr. Philipp Tomsich
  2014-11-19 21:58     ` [PATCH 2/2, AArch64, v2] " Philipp Tomsich
  1 sibling, 0 replies; 14+ messages in thread
From: Dr. Philipp Tomsich @ 2014-11-19 19:45 UTC (permalink / raw)
  To: Kyrill Tkachov
  Cc: gcc-patches, marcus.shawcroft, benedikt.huber, Kumar Sankaran

Kyrill,

The XGene-cores all support AArch32.
We plan to wire this up, but I’d like to merge this as-is (i.e. without wiring it up for AArch32) initially, as we haven’t done the same amount of QA on these with AArch32 as with AArch64.

Do you have any issue with this going into the config/arm-directory, even though it’s not wired up there yet? 
If you want me to move it over to the config/aarch64 directory for now, I’d have to move it back to config/arm later on…

Cheers,
—Philipp.


> On 19 Nov 2014, at 19:06, Kyrill Tkachov <kyrylo.tkachov@arm.com> wrote:
> 
> Hi Philipp,
> 
> One more comment...
> 
> On 19/11/14 17:32, Philipp Tomsich wrote:
>> @@ -4211,3 +4211,5 @@
>> 
>>  ;; Atomic Operations
>>  (include "atomics.md")
>> +
>> +(include "../arm/xgene1.md")
> 
> Do you expect to add arm support for this core? If so, you should wire it up there as well.
> If not, I think Andrew set a precedent for aarch64-only cores descriptions going in the config/aarch64/ directory
> 
> Cheers,
> Kyrill
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2, AArch64, v2] Pipeline model for APM XGene-1.
  2014-11-19 18:11   ` Kyrill Tkachov
  2014-11-19 19:45     ` Dr. Philipp Tomsich
@ 2014-11-19 21:58     ` Philipp Tomsich
  2014-11-20  9:38       ` Kyrill Tkachov
  2014-11-20 11:25       ` Ramana Radhakrishnan
  1 sibling, 2 replies; 14+ messages in thread
From: Philipp Tomsich @ 2014-11-19 21:58 UTC (permalink / raw)
  To: gcc-patches, kyrylo.tkachov
  Cc: benedikt.huber, pinskia, marcus.shawcroft, ksankaran, Philipp Tomsich

Here's an updated patch with Kyrill's and Andrew's comments integrated.

I left the file in the config/arm directory, as the XGene family is capable of
executing ARMv7 and we will wire this into the 32bit backend in the near
future (moving it now would just cause another move in the near future).

We also moved the 'include' up to where the pipeline models for the 
A53/A57/ThunderX are included, as the previous dependency on picking up the
SIMD types from aarch64-simd.md no longer holds true since gcc-4.9.

Cheers,
-Philipp.


---
 gcc/ChangeLog                 |   6 +
 gcc/config/aarch64/aarch64.md |   3 +-
 gcc/config/arm/xgene1.md      | 520 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 528 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/arm/xgene1.md

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c9ac0d9..dad2278 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,11 @@
 2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
 
+	* config/aarch64/aarch64.md: Include xgene1.md.
+	(generic_sched): Set to no for xgene1.
+	* config/arm/xgene1.md: New file.
+
+2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
 	* config/aarch64/aarch64-cores.def (xgene1): Update/add the
 	xgene1 (APM XGene-1) core definition.
 	* gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..1b36384 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -191,7 +191,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
+          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
           (const_string "no")
           (const_string "yes"))))
 
@@ -199,6 +199,7 @@
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a15.md")
 (include "thunderx.md")
+(include "../arm/xgene1.md")
 
 ;; -------------------------------------------------------------------
 ;; Jumps and other miscellaneous insns
diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
new file mode 100644
index 0000000..227f2c7
--- /dev/null
+++ b/gcc/config/arm/xgene1.md
@@ -0,0 +1,520 @@
+;; Machine description for AppliedMicro xgene1 core.
+;; Copyright (C) 2012-2014 Free Software Foundation, Inc.
+;; Contributed by Theobroma Systems Design und Consulting GmbH.
+;;                See http://www.theobroma-systems.com for more info.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Pipeline description for the xgene1 micro-architecture
+
+(define_automaton "xgene1")
+
+(define_cpu_unit "xgene1_decode_out0" "xgene1")
+(define_cpu_unit "xgene1_decode_out1" "xgene1")
+(define_cpu_unit "xgene1_decode_out2" "xgene1")
+(define_cpu_unit "xgene1_decode_out3" "xgene1")
+
+(define_cpu_unit "xgene_divide" "xgene1")
+(define_cpu_unit "xgene_fp_divide" "xgene1")
+
+(define_reservation "xgene1_decode1op"
+        "( xgene1_decode_out0 )
+        |( xgene1_decode_out1 )
+        |( xgene1_decode_out2 )
+        |( xgene1_decode_out3 )"
+)
+(define_reservation "xgene1_decode2op"
+        "( xgene1_decode_out0 + xgene1_decode_out1 )
+        |( xgene1_decode_out0 + xgene1_decode_out2 )
+        |( xgene1_decode_out0 + xgene1_decode_out3 )
+        |( xgene1_decode_out1 + xgene1_decode_out2 )
+        |( xgene1_decode_out1 + xgene1_decode_out3 )
+        |( xgene1_decode_out2 + xgene1_decode_out3 )"
+)
+(define_reservation "xgene1_decodeIsolated"
+        "( xgene1_decode_out0 + xgene1_decode_out1 + xgene1_decode_out2 + xgene1_decode_out3 )"
+)
+
+(define_insn_reservation "branch" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "branch"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "nop" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "no_insn"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "call" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "call"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "f_load" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_loadd,f_loads"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "f_store" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_stored,f_stores"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "fmov" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fmov,fconsts,fconstd"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_mcr" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mcr"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "f_mrc" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mrc"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "load_pair" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load2"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "store_pair" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store2"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "load1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load1"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "store1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store1"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "move" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mov_reg,mov_imm,mrs"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "alu" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
+                        alu_ext,adc_reg,csel,logic_imm,\
+                        logic_reg,logic_shift_imm,clz,\
+                        rbit,shift_reg,adr,mov_reg,\
+                        mov_imm,extend"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "simd" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "rev"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "alus" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
+                        alus_ext,logics_imm,logics_reg,\
+                        logics_shift_imm"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "mul" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "div" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "sdiv,udiv"))
+  "xgene1_decode1op,xgene_divide")
+
+(define_insn_reservation "fcmp" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcmpd,fcmps"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "fcsel" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcsel"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "bfm" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "bfm"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_rint" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_rintd,f_rints"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_cvt" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvt"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_cvtf2i" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvtf2i"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "f_cvti2f" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvti2f"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_div" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fdivd,fdivs"))
+  "xgene1_decode1op,xgene_fp_divide")
+
+(define_insn_reservation "f_arith" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "ffarithd,ffariths"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "f_sqrt" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fsqrtd,fsqrts"))
+  "xgene1_decode1op,xgene_fp_divide")
+
+(define_insn_reservation "f_select" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_minmaxd,f_minmaxs"))
+  "xgene1_decode1op")
+
+
+(define_insn_reservation "neon_dup" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_dup,neon_dup_q"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_load1" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_store1" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_logic" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_logic,\
+                        neon_logic_q,\
+                        neon_bsl,\
+                        neon_bsl_q,\
+                        neon_move,\
+                        neon_move_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_umov" 7
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "neon_ins" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_from_gp,\
+                        neon_from_gp_q,\
+                        neon_ins,\
+                        neon_ins_q,\
+                       "))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "neon_shift" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_imm,\
+                        neon_shift_imm_q,\
+                        neon_shift_reg,\
+                        neon_shift_reg_q,\
+                        neon_shift_imm_long,\
+                        neon_sat_shift_imm,\
+                        neon_sat_shift_imm_q,\
+                        neon_sat_shift_imm_narrow_q,\
+                        neon_sat_shift_reg,\
+                        neon_sat_shift_reg_q,\
+                        neon_shift_imm_narrow_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_arith" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_add,\
+                        neon_add_q,\
+                        neon_sub,\
+                        neon_sub_q,\
+                        neon_neg,\
+                        neon_neg_q,\
+                        neon_abs,\
+                        neon_abs_q,\
+                        neon_abd_q,\
+                        neon_arith_acc,\
+                        neon_arith_acc_q,\
+                        neon_reduc_add,\
+                        neon_reduc_add_q,\
+                        neon_add_halve,\
+                        neon_add_halve_q,\
+                        neon_sub_halve,\
+                        neon_sub_halve_q,\
+                        neon_qadd,\
+                        neon_qadd_q,\
+                        neon_compare,\
+                        neon_compare_q,\
+                        neon_compare_zero,\
+                        neon_compare_zero_q,\
+                        neon_tst,\
+                        neon_tst_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_abs_diff" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_mul" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_b,\
+                        neon_mul_b_q,\
+                        neon_mul_h,\
+                        neon_mul_h_q,\
+                        neon_mul_s,\
+                        neon_mul_s_q,\
+                        neon_fp_mul_s_scalar,\
+                        neon_fp_mul_s_scalar_q,\
+                        neon_fp_mul_d_scalar_q,\
+                        neon_mla_b,neon_mla_b_q,\
+                        neon_mla_h,neon_mla_h_q,\
+                        neon_mla_s,neon_mla_s_q,\
+                        neon_mla_h_scalar,\
+                        neon_mla_h_scalar_q,\
+                        neon_mla_s_scalar,\
+                        neon_mla_s_scalar_q,\
+                        neon_mla_b_long,\
+                        neon_mla_h_long,\
+                        neon_mla_s_long,\
+                        neon_fp_mul_s,\
+                        neon_fp_mul_s_q,\
+                        neon_fp_mul_d,\
+                        neon_fp_mul_d_q,\
+                        neon_fp_mla_s,\
+                        neon_fp_mla_s_q,\
+                        neon_fp_mla_d,\
+                        neon_fp_mla_d_q,\
+                        neon_fp_mla_s_scalar,\
+                        neon_fp_mla_s_scalar_q,\
+                        neon_fp_mla_d_scalar_q,\
+                        neon_sat_mul_b,\
+                        neon_sat_mul_b_q,\
+                        neon_sat_mul_h,\
+                        neon_sat_mul_h_q,\
+                        neon_sat_mul_s,\
+                        neon_sat_mul_s_q,\
+                        neon_sat_mul_h_scalar,\
+                        neon_sat_mul_h_scalar_q,\
+                        neon_sat_mul_s_scalar,\
+                        neon_sat_mul_s_scalar_q,\
+                        neon_sat_mul_h_scalar_long,\
+                        neon_sat_mul_s_scalar_long,\
+                        neon_sat_mla_b_long,\
+                        neon_sat_mla_h_long,\
+                        neon_sat_mla_s_long,\
+                        neon_sat_mla_h_scalar_long,\
+                        neon_sat_mla_s_scalar_long,\
+                       "))
+  "xgene1_decode2op")
+
+(define_insn_reservation "fp_abd_diff" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_abd_s,\
+                        neon_fp_abd_s_q,\
+                        neon_fp_abd_d,\
+                        neon_fp_abd_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_addsub_s,\
+                        neon_fp_addsub_s_q,\
+                        neon_fp_addsub_d,\
+                        neon_fp_addsub_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_f_div" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_div_s,\
+                        neon_fp_div_s_q,\
+                        neon_fp_div_d,\
+                        neon_fp_div_d_q,\
+                       "))
+  "xgene1_decode1op,xgene_fp_divide")
+
+(define_insn_reservation "neon_f_neg" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_neg_s,\
+                        neon_fp_neg_s_q,\
+                        neon_fp_neg_d,\
+                        neon_fp_neg_d_q,\
+                        neon_fp_abs_s,\
+                        neon_fp_abs_s_q,\
+                        neon_fp_abs_d,\
+                        neon_fp_abs_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_f_round" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_round_s,\
+                        neon_fp_round_s_q,\
+                        neon_fp_round_d,\
+                        neon_fp_round_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_f_cvt" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type"  "neon_int_to_fp_s,\
+                         neon_int_to_fp_s_q,\
+                         neon_int_to_fp_d,\
+                         neon_int_to_fp_d_q,\
+                         neon_fp_cvt_widen_s,\
+                         neon_fp_cvt_narrow_s_q,\
+                         neon_fp_cvt_narrow_d_q,\
+                        "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_f_reduc" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_reduc_add_s,\
+                        neon_fp_reduc_add_s_q,\
+                        neon_fp_reduc_add_d,\
+                        neon_fp_reduc_add_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_cls" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_cls,neon_cls_q"))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_st1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_one_lane,\
+                        neon_store1_one_lane_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_halve_narrow" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_sub_halve_narrow_q,\
+                        neon_add_halve_narrow_q,\
+                       "))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "neon_shift_acc" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_acc,\
+                        neon_shift_acc_q,\
+                       "))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_fp_compare" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_compare_s,\
+                        neon_fp_compare_s_q,\
+                        neon_fp_compare_d,\
+                        neon_fp_compare_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_fp_sqrt" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_sqrt_s,\
+                        neon_fp_sqrt_s_q,\
+                        neon_fp_sqrt_d,\
+                        neon_fp_sqrt_d_q,\
+                       "))
+  "xgene1_decode1op,xgene_fp_divide")
+
+(define_insn_reservation "neon_tbl1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl1,\
+                        neon_tbl1_q,\
+                       "))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_tbl2" 8
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl2,\
+                        neon_tbl2_q,\
+                       "))
+  "xgene1_decodeIsolated")
+
+(define_insn_reservation "neon_permute" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_permute,\
+                        neon_permute_q,\
+                       "))
+  "xgene1_decode2op")
+
+(define_insn_reservation "neon_ld1r" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_all_lanes,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_fp_recp" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recpe_s,\
+                        neon_fp_recpe_s_q,\
+                        neon_fp_recpe_d,\
+                        neon_fp_recpe_d_q,\
+                        neon_fp_recpx_s,\
+                        neon_fp_recpx_s_q,\
+                        neon_fp_recpx_d,\
+                        neon_fp_recpx_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+
+(define_insn_reservation "neon_fp_recp_s" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recps_s,\
+                        neon_fp_recps_s_q,\
+                        neon_fp_recps_d,\
+                        neon_fp_recps_d_q,\
+                       "))
+  "xgene1_decode1op")
+
+(define_insn_reservation "neon_pmull" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_d_long,\
+                       "))
+  "xgene1_decode2op")
-- 
1.9.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64, v2] Pipeline model for APM XGene-1.
  2014-11-19 21:58     ` [PATCH 2/2, AArch64, v2] " Philipp Tomsich
@ 2014-11-20  9:38       ` Kyrill Tkachov
  2014-11-20 11:01         ` Dr. Philipp Tomsich
  2014-11-20 11:25       ` Ramana Radhakrishnan
  1 sibling, 1 reply; 14+ messages in thread
From: Kyrill Tkachov @ 2014-11-20  9:38 UTC (permalink / raw)
  To: Philipp Tomsich, gcc-patches
  Cc: benedikt.huber, pinskia, marcus.shawcroft, ksankaran

Hi Philipp,

I don't mind it being in config/arm if you plan to wire it up later, 
good to know.
Another comment inline....

Thanks,
Kyrill

On 19/11/14 21:42, Philipp Tomsich wrote:
> Here's an updated patch with Kyrill's and Andrew's comments integrated.
>
> I left the file in the config/arm-directory, as XGene-family is capable of
> executing ARMv7 and we will wire this into the 32bit backend in the near
> future (moving it now would just cause another move in the near future).
>
> We also moved the 'include' up to where the pipeline models for the
> A53/A57/ThunderX are included, as the previous dependency on picking up the
> SIMD types from aarch64-simd.md no longer holds true since gcc-4.9.
>
> Cheers,
> -Philipp.
>
>
> ---
>   gcc/ChangeLog                 |   6 +
>   gcc/config/aarch64/aarch64.md |   3 +-
>   gcc/config/arm/xgene1.md      | 520 ++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 528 insertions(+), 1 deletion(-)
>   create mode 100644 gcc/config/arm/xgene1.md
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c9ac0d9..dad2278 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,11 @@
>   2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>
> +       * config/aarch64/aarch64.md: Include xgene1.md.
> +       (generic_sched): Set to no for xgene1.
> +       * config/arm/xgene1.md: New file.
> +
> +2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
>          * config/aarch64/aarch64-cores.def (xgene1): Update/add the
>          xgene1 (APM XGene-1) core definition.
>          * gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 597ff8c..1b36384 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -191,7 +191,7 @@
>
>   (define_attr "generic_sched" "yes,no"
>     (const (if_then_else
> -          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
> +          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
>             (const_string "no")
>             (const_string "yes"))))
>
> @@ -199,6 +199,7 @@
>   (include "../arm/cortex-a53.md")
>   (include "../arm/cortex-a15.md")
>   (include "thunderx.md")
> +(include "../arm/xgene1.md")
>
>   ;; -------------------------------------------------------------------
>   ;; Jumps and other miscellaneous insns
> diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
> new file mode 100644
> index 0000000..227f2c7
> --- /dev/null
> +++ b/gcc/config/arm/xgene1.md
> @@ -0,0 +1,520 @@
> +;; Machine description for AppliedMicro xgene1 core.
> +;; Copyright (C) 2012-2014 Free Software Foundation, Inc.
> +;; Contributed by Theobroma Systems Design und Consulting GmbH.
> +;;                See http://www.theobroma-systems.com for more info.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; Pipeline description for the xgene1 micro-architecture
> +
> +(define_automaton "xgene1")
> +
> +(define_cpu_unit "xgene1_decode_out0" "xgene1")
> +(define_cpu_unit "xgene1_decode_out1" "xgene1")
> +(define_cpu_unit "xgene1_decode_out2" "xgene1")
> +(define_cpu_unit "xgene1_decode_out3" "xgene1")
> +
> +(define_cpu_unit "xgene_divide" "xgene1")
> +(define_cpu_unit "xgene_fp_divide" "xgene1")

Why are these xgene_* while the other units are xgene1_*?

> +
> +(define_reservation "xgene1_decode1op"
> +        "( xgene1_decode_out0 )
> +        |( xgene1_decode_out1 )
> +        |( xgene1_decode_out2 )
> +        |( xgene1_decode_out3 )"
> +)
> +(define_reservation "xgene1_decode2op"
> +        "( xgene1_decode_out0 + xgene1_decode_out1 )
> +        |( xgene1_decode_out0 + xgene1_decode_out2 )
> +        |( xgene1_decode_out0 + xgene1_decode_out3 )
> +        |( xgene1_decode_out1 + xgene1_decode_out2 )
> +        |( xgene1_decode_out1 + xgene1_decode_out3 )
> +        |( xgene1_decode_out2 + xgene1_decode_out3 )"
> +)
> +(define_reservation "xgene1_decodeIsolated"
> +        "( xgene1_decode_out0 + xgene1_decode_out1 + xgene1_decode_out2 + xgene1_decode_out3 )"
> +)
> +
> +(define_insn_reservation "branch" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "branch"))
> +  "xgene1_decode1op")

insn_reservation names should also have the xgene1_* namespace

> +
> +(define_insn_reservation "nop" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "no_insn"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "call" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "call"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "f_load" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_loadd,f_loads"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "f_store" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_stored,f_stores"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "fmov" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fmov,fconsts,fconstd"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_mcr" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mcr"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_mrc" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mrc"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "load_pair" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load2"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "store_pair" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store2"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "load1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load1"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "store1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store1"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "move" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mov_reg,mov_imm,mrs"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "alu" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
> +                        alu_ext,adc_reg,csel,logic_imm,\
> +                        logic_reg,logic_shift_imm,clz,\
> +                        rbit,shift_reg,adr,mov_reg,\
> +                        mov_imm,extend"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "simd" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "rev"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "alus" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
> +                        alus_ext,logics_imm,logics_reg,\
> +                        logics_shift_imm"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "mul" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "sdiv,udiv"))
> +  "xgene1_decode1op,xgene_divide")

The dangerous part was the reservation duration (the
xgene_divide*<large number>).
The latency number (2 in this version, 66 in the previous) is not
harmful to the automaton size and can be as high as needed (if this
operation is high latency).

> +
> +(define_insn_reservation "fcmp" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcmpd,fcmps"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "fcsel" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcsel"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "bfm" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "bfm"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_rint" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_rintd,f_rints"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_cvt" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvt"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_cvtf2i" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvtf2i"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_cvti2f" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvti2f"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fdivd,fdivs"))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "f_arith" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "ffarithd,ffariths"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_sqrt" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fsqrtd,fsqrts"))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "f_select" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_minmaxd,f_minmaxs"))
> +  "xgene1_decode1op")
> +
> +
> +(define_insn_reservation "neon_dup" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_dup,neon_dup_q"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_load1" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_store1" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_logic" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_logic,\
> +                        neon_logic_q,\
> +                        neon_bsl,\
> +                        neon_bsl_q,\
> +                        neon_move,\
> +                        neon_move_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_umov" 7
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_ins" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_from_gp,\
> +                        neon_from_gp_q,\
> +                        neon_ins,\
> +                        neon_ins_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_shift" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_imm,\
> +                        neon_shift_imm_q,\
> +                        neon_shift_reg,\
> +                        neon_shift_reg_q,\
> +                        neon_shift_imm_long,\
> +                        neon_sat_shift_imm,\
> +                        neon_sat_shift_imm_q,\
> +                        neon_sat_shift_imm_narrow_q,\
> +                        neon_sat_shift_reg,\
> +                        neon_sat_shift_reg_q,\
> +                        neon_shift_imm_narrow_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_arith" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_add,\
> +                        neon_add_q,\
> +                        neon_sub,\
> +                        neon_sub_q,\
> +                        neon_neg,\
> +                        neon_neg_q,\
> +                        neon_abs,\
> +                        neon_abs_q,\
> +                        neon_abd_q,\
> +                        neon_arith_acc,\
> +                        neon_arith_acc_q,\
> +                        neon_reduc_add,\
> +                        neon_reduc_add_q,\
> +                        neon_add_halve,\
> +                        neon_add_halve_q,\
> +                        neon_sub_halve,\
> +                        neon_sub_halve_q,\
> +                        neon_qadd,\
> +                        neon_qadd_q,\
> +                        neon_compare,\
> +                        neon_compare_q,\
> +                        neon_compare_zero,\
> +                        neon_compare_zero_q,\
> +                        neon_tst,\
> +                        neon_tst_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_abs_diff" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_mul" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_b,\
> +                        neon_mul_b_q,\
> +                        neon_mul_h,\
> +                        neon_mul_h_q,\
> +                        neon_mul_s,\
> +                        neon_mul_s_q,\
> +                        neon_fp_mul_s_scalar,\
> +                        neon_fp_mul_s_scalar_q,\
> +                        neon_fp_mul_d_scalar_q,\
> +                        neon_mla_b,neon_mla_b_q,\
> +                        neon_mla_h,neon_mla_h_q,\
> +                        neon_mla_s,neon_mla_s_q,\
> +                        neon_mla_h_scalar,\
> +                        neon_mla_h_scalar_q,\
> +                        neon_mla_s_scalar,\
> +                        neon_mla_s_scalar_q,\
> +                        neon_mla_b_long,\
> +                        neon_mla_h_long,\
> +                        neon_mla_s_long,\
> +                        neon_fp_mul_s,\
> +                        neon_fp_mul_s_q,\
> +                        neon_fp_mul_d,\
> +                        neon_fp_mul_d_q,\
> +                        neon_fp_mla_s,\
> +                        neon_fp_mla_s_q,\
> +                        neon_fp_mla_d,\
> +                        neon_fp_mla_d_q,\
> +                        neon_fp_mla_s_scalar,\
> +                        neon_fp_mla_s_scalar_q,\
> +                        neon_fp_mla_d_scalar_q,\
> +                        neon_sat_mul_b,\
> +                        neon_sat_mul_b_q,\
> +                        neon_sat_mul_h,\
> +                        neon_sat_mul_h_q,\
> +                        neon_sat_mul_s,\
> +                        neon_sat_mul_s_q,\
> +                        neon_sat_mul_h_scalar,\
> +                        neon_sat_mul_h_scalar_q,\
> +                        neon_sat_mul_s_scalar,\
> +                        neon_sat_mul_s_scalar_q,\
> +                        neon_sat_mul_h_scalar_long,\
> +                        neon_sat_mul_s_scalar_long,\
> +                        neon_sat_mla_b_long,\
> +                        neon_sat_mla_h_long,\
> +                        neon_sat_mla_s_long,\
> +                        neon_sat_mla_h_scalar_long,\
> +                        neon_sat_mla_s_scalar_long,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "fp_abd_diff" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_abd_s,\
> +                        neon_fp_abd_s_q,\
> +                        neon_fp_abd_d,\
> +                        neon_fp_abd_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_addsub_s,\
> +                        neon_fp_addsub_s_q,\
> +                        neon_fp_addsub_d,\
> +                        neon_fp_addsub_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_div_s,\
> +                        neon_fp_div_s_q,\
> +                        neon_fp_div_d,\
> +                        neon_fp_div_d_q,\
> +                       "))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "neon_f_neg" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_neg_s,\
> +                        neon_fp_neg_s_q,\
> +                        neon_fp_neg_d,\
> +                        neon_fp_neg_d_q,\
> +                        neon_fp_abs_s,\
> +                        neon_fp_abs_s_q,\
> +                        neon_fp_abs_d,\
> +                        neon_fp_abs_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_round" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_round_s,\
> +                        neon_fp_round_s_q,\
> +                        neon_fp_round_d,\
> +                        neon_fp_round_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_cvt" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type"  "neon_int_to_fp_s,\
> +                         neon_int_to_fp_s_q,\
> +                         neon_int_to_fp_d,\
> +                         neon_int_to_fp_d_q,\
> +                         neon_fp_cvt_widen_s,\
> +                         neon_fp_cvt_narrow_s_q,\
> +                         neon_fp_cvt_narrow_d_q,\
> +                        "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_reduc" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_reduc_add_s,\
> +                        neon_fp_reduc_add_s_q,\
> +                        neon_fp_reduc_add_d,\
> +                        neon_fp_reduc_add_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_cls" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_cls,neon_cls_q"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_st1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_one_lane,\
> +                        neon_store1_one_lane_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_halve_narrow" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_sub_halve_narrow_q,\
> +                        neon_add_halve_narrow_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_shift_acc" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_acc,\
> +                        neon_shift_acc_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_fp_compare" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_compare_s,\
> +                        neon_fp_compare_s_q,\
> +                        neon_fp_compare_d,\
> +                        neon_fp_compare_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_fp_sqrt" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_sqrt_s,\
> +                        neon_fp_sqrt_s_q,\
> +                        neon_fp_sqrt_d,\
> +                        neon_fp_sqrt_d_q,\
> +                       "))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "neon_tbl1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl1,\
> +                        neon_tbl1_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_tbl2" 8
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl2,\
> +                        neon_tbl2_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_permute" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_permute,\
> +                        neon_permute_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_ld1r" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_all_lanes,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_fp_recp" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recpe_s,\
> +                        neon_fp_recpe_s_q,\
> +                        neon_fp_recpe_d,\
> +                        neon_fp_recpe_d_q,\
> +                        neon_fp_recpx_s,\
> +                        neon_fp_recpx_s_q,\
> +                        neon_fp_recpx_d,\
> +                        neon_fp_recpx_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +
> +(define_insn_reservation "neon_fp_recp_s" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recps_s,\
> +                        neon_fp_recps_s_q,\
> +                        neon_fp_recps_d,\
> +                        neon_fp_recps_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_pmull" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_d_long,\
> +                       "))
> +  "xgene1_decode2op")
> --
> 1.9.1
>
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64, v2] Pipeline model for APM XGene-1.
  2014-11-20  9:38       ` Kyrill Tkachov
@ 2014-11-20 11:01         ` Dr. Philipp Tomsich
  2014-11-20 11:01           ` Kyrill Tkachov
  0 siblings, 1 reply; 14+ messages in thread
From: Dr. Philipp Tomsich @ 2014-11-20 11:01 UTC (permalink / raw)
  To: Kyrill Tkachov
  Cc: gcc-patches, benedikt.huber, pinskia, marcus.shawcroft, Kumar Sankaran

Kyrill,

> I don't mind it being in config/arm if you plan to wire it up later, good to know.
> Another comment inline….

I’ll clean up the missing xgene1_ and the mistyped xgene_ prefix and resubmit.

>> +(define_insn_reservation "div" 2
>> +  (and (eq_attr "tune" "xgene1")
>> +       (eq_attr "type" "sdiv,udiv"))
>> +  "xgene1_decode1op,xgene_divide")
> 
> The dangerous part was the reservation duration (the xgene_divide*<large number>).
> The latency number (2 in this version, 66 in the previous) is not harmful to the automaton size
> and can be as high as needed (if this operation is high latency)....

It doesn’t really matter for any workload we’ve encountered, as the hardware is better at dealing with ‘div’-latencies than the scheduler (especially as ‘div’ is variable latency and any guess we have will be wrong… we’ll likely add a scheduling hook function in the future).
The more important thing is to keep the cost of divides high enough in the cost-model.

In other words: 66 would be the worst case and will normally not be correct anyway. Furthermore, it’s rather implausible that we would find 264 instructions (for this worst-case scenario) to fill the scheduling bubble between the div-insn and its result usage.

Best,
Philipp.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64, v2] Pipeline model for APM XGene-1.
  2014-11-20 11:01         ` Dr. Philipp Tomsich
@ 2014-11-20 11:01           ` Kyrill Tkachov
  0 siblings, 0 replies; 14+ messages in thread
From: Kyrill Tkachov @ 2014-11-20 11:01 UTC (permalink / raw)
  To: Dr. Philipp Tomsich
  Cc: gcc-patches, benedikt.huber, pinskia, marcus.shawcroft, ksankaran

Hi Philipp,

On 20/11/14 10:47, Dr. Philipp Tomsich wrote:
> Kyrill,
>
>> I don't mind it being in config/arm if you plan to wire it up later, good to know.
>> Another comment inline….
> I’ll clean up the missing xgene1_ and the mistyped xgene_ prefix and resubmit.
>
>>> +(define_insn_reservation "div" 2
>>> +  (and (eq_attr "tune" "xgene1")
>>> +       (eq_attr "type" "sdiv,udiv"))
>>> +  "xgene1_decode1op,xgene_divide")
>> The dangerous part was the reservation duration (the xgene_divide*<large number>).
>> The latency number (2 in this version, 66 in the previous) is not harmful to the automaton size
>> and can be as high as needed (if this operation is high latency)....
> It doesn’t really matter for any workload we’ve encountered, as the hardware is better at dealing with ‘div’-latencies than the scheduler (especially as ‘div’ is variable latency and any guess we have will be wrong… we’ll likely add a scheduling hook function in the future).
> The more important thing is to keep the cost of divides high enough in the cost-model.
>
> In other words: 66 would be the worst case and will normally not be correct anyway. Furthermore, it’s rather implausible that we would find 264 instructions (for this worst-case scenario) to fill the scheduling bubble between the div-insn and its result usage.

Ok, makes sense. I just thought that 2 is a bit too low but if your 
benchmarking showed it to be reasonable I won't complain ;)

Kyrill

>
> Best,
> Philipp.
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2, AArch64, v2] Pipeline model for APM XGene-1.
  2014-11-19 21:58     ` [PATCH 2/2, AArch64, v2] " Philipp Tomsich
  2014-11-20  9:38       ` Kyrill Tkachov
@ 2014-11-20 11:25       ` Ramana Radhakrishnan
  1 sibling, 0 replies; 14+ messages in thread
From: Ramana Radhakrishnan @ 2014-11-20 11:25 UTC (permalink / raw)
  To: Philipp Tomsich
  Cc: gcc-patches, Kyrylo Tkachov, benedikt.huber, Andrew Pinski,
	Marcus Shawcroft, ksankaran

On Wed, Nov 19, 2014 at 9:42 PM, Philipp Tomsich
<philipp.tomsich@theobroma-systems.com> wrote:
> Here's an updated patch with Kyrill's and Andrew's comments integrated.
>
> I left the file in the config/arm-directory, as XGene-family is capable of
> executing ARMv7 and we will wire this into the 32bit backend in the near
> future (moving it now would just cause another move in the near future).
>

Right, if this were making it into the arm backend and if the core
indeed does have AArch32 support, I'd like to see support for the
command line for xgene1 in the AArch32 backend as well for 5.0. Do
have a look in arm-cores.def in gcc/config/arm - there are ways of
using existing tuning options with the command line or putting this as
part of "generic". We've been here before and users typically complain
about CPU option X being available in AArch32 state but not in AArch64
state. Since this is a separate tuning option, I'm less worried about
this going in later in stage3 but realistically it would be good to
have the command line options wired up for AArch32 by the end of the
year.

Ramana


> We also moved the 'include' up to where the pipeline models for the
> A53/A57/ThunderX are included, as the previous dependency on picking up the
> SIMD types from aarch64-simd.md no longer holds true since gcc-4.9.
>
> Cheers,
> -Philipp.
>
>
> ---
>  gcc/ChangeLog                 |   6 +
>  gcc/config/aarch64/aarch64.md |   3 +-
>  gcc/config/arm/xgene1.md      | 520 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 528 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/config/arm/xgene1.md
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c9ac0d9..dad2278 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,11 @@
>  2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>
> +       * config/aarch64/aarch64.md: Include xgene1.md.
> +       (generic_sched): Set to no for xgene1.
> +       * config/arm/xgene1.md: New file.
> +
> +2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
>         * config/aarch64/aarch64-cores.def (xgene1): Update/add the
>         xgene1 (APM XGene-1) core definition.
>         * gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 597ff8c..1b36384 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -191,7 +191,7 @@
>
>  (define_attr "generic_sched" "yes,no"
>    (const (if_then_else
> -          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
> +          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
>            (const_string "no")
>            (const_string "yes"))))
>
> @@ -199,6 +199,7 @@
>  (include "../arm/cortex-a53.md")
>  (include "../arm/cortex-a15.md")
>  (include "thunderx.md")
> +(include "../arm/xgene1.md")
>
>  ;; -------------------------------------------------------------------
>  ;; Jumps and other miscellaneous insns
> diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
> new file mode 100644
> index 0000000..227f2c7
> --- /dev/null
> +++ b/gcc/config/arm/xgene1.md
> @@ -0,0 +1,520 @@
> +;; Machine description for AppliedMicro xgene1 core.
> +;; Copyright (C) 2012-2014 Free Software Foundation, Inc.
> +;; Contributed by Theobroma Systems Design und Consulting GmbH.
> +;;                See http://www.theobroma-systems.com for more info.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; Pipeline description for the xgene1 micro-architecture
> +
> +(define_automaton "xgene1")
> +
> +(define_cpu_unit "xgene1_decode_out0" "xgene1")
> +(define_cpu_unit "xgene1_decode_out1" "xgene1")
> +(define_cpu_unit "xgene1_decode_out2" "xgene1")
> +(define_cpu_unit "xgene1_decode_out3" "xgene1")
> +
> +(define_cpu_unit "xgene_divide" "xgene1")
> +(define_cpu_unit "xgene_fp_divide" "xgene1")
> +
> +(define_reservation "xgene1_decode1op"
> +        "( xgene1_decode_out0 )
> +        |( xgene1_decode_out1 )
> +        |( xgene1_decode_out2 )
> +        |( xgene1_decode_out3 )"
> +)
> +(define_reservation "xgene1_decode2op"
> +        "( xgene1_decode_out0 + xgene1_decode_out1 )
> +        |( xgene1_decode_out0 + xgene1_decode_out2 )
> +        |( xgene1_decode_out0 + xgene1_decode_out3 )
> +        |( xgene1_decode_out1 + xgene1_decode_out2 )
> +        |( xgene1_decode_out1 + xgene1_decode_out3 )
> +        |( xgene1_decode_out2 + xgene1_decode_out3 )"
> +)
> +(define_reservation "xgene1_decodeIsolated"
> +        "( xgene1_decode_out0 + xgene1_decode_out1 + xgene1_decode_out2 + xgene1_decode_out3 )"
> +)
> +
> +(define_insn_reservation "branch" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "branch"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "nop" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "no_insn"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "call" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "call"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "f_load" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_loadd,f_loads"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "f_store" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_stored,f_stores"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "fmov" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fmov,fconsts,fconstd"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_mcr" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mcr"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_mrc" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_mrc"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "load_pair" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load2"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "store_pair" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store2"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "load1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "load1"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "store1" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "store1"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "move" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mov_reg,mov_imm,mrs"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "alu" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
> +                        alu_ext,adc_reg,csel,logic_imm,\
> +                        logic_reg,logic_shift_imm,clz,\
> +                        rbit,shift_reg,adr,mov_reg,\
> +                        mov_imm,extend"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "simd" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "rev"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "alus" 1
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
> +                        alus_ext,logics_imm,logics_reg,\
> +                        logics_shift_imm"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "mul" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "sdiv,udiv"))
> +  "xgene1_decode1op,xgene_divide")
> +
> +(define_insn_reservation "fcmp" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcmpd,fcmps"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "fcsel" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fcsel"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "bfm" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "bfm"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_rint" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_rintd,f_rints"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_cvt" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvt"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_cvtf2i" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvtf2i"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_cvti2f" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_cvti2f"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fdivd,fdivs"))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "f_arith" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "ffarithd,ffariths"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "f_sqrt" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "fsqrtd,fsqrts"))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "f_select" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "f_minmaxd,f_minmaxs"))
> +  "xgene1_decode1op")
> +
> +
> +(define_insn_reservation "neon_dup" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_dup,neon_dup_q"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_load1" 11
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_store1" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_logic" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_logic,\
> +                        neon_logic_q,\
> +                        neon_bsl,\
> +                        neon_bsl_q,\
> +                        neon_move,\
> +                        neon_move_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_umov" 7
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_ins" 14
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_from_gp,\
> +                        neon_from_gp_q,\
> +                        neon_ins,\
> +                        neon_ins_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_shift" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_imm,\
> +                        neon_shift_imm_q,\
> +                        neon_shift_reg,\
> +                        neon_shift_reg_q,\
> +                        neon_shift_imm_long,\
> +                        neon_sat_shift_imm,\
> +                        neon_sat_shift_imm_q,\
> +                        neon_sat_shift_imm_narrow_q,\
> +                        neon_sat_shift_reg,\
> +                        neon_sat_shift_reg_q,\
> +                        neon_shift_imm_narrow_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_arith" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_add,\
> +                        neon_add_q,\
> +                        neon_sub,\
> +                        neon_sub_q,\
> +                        neon_neg,\
> +                        neon_neg_q,\
> +                        neon_abs,\
> +                        neon_abs_q,\
> +                        neon_abd_q,\
> +                        neon_arith_acc,\
> +                        neon_arith_acc_q,\
> +                        neon_reduc_add,\
> +                        neon_reduc_add_q,\
> +                        neon_add_halve,\
> +                        neon_add_halve_q,\
> +                        neon_sub_halve,\
> +                        neon_sub_halve_q,\
> +                        neon_qadd,\
> +                        neon_qadd_q,\
> +                        neon_compare,\
> +                        neon_compare_q,\
> +                        neon_compare_zero,\
> +                        neon_compare_zero_q,\
> +                        neon_tst,\
> +                        neon_tst_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_abs_diff" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_mul" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_b,\
> +                        neon_mul_b_q,\
> +                        neon_mul_h,\
> +                        neon_mul_h_q,\
> +                        neon_mul_s,\
> +                        neon_mul_s_q,\
> +                        neon_fp_mul_s_scalar,\
> +                        neon_fp_mul_s_scalar_q,\
> +                        neon_fp_mul_d_scalar_q,\
> +                        neon_mla_b,neon_mla_b_q,\
> +                        neon_mla_h,neon_mla_h_q,\
> +                        neon_mla_s,neon_mla_s_q,\
> +                        neon_mla_h_scalar,\
> +                        neon_mla_h_scalar_q,\
> +                        neon_mla_s_scalar,\
> +                        neon_mla_s_scalar_q,\
> +                        neon_mla_b_long,\
> +                        neon_mla_h_long,\
> +                        neon_mla_s_long,\
> +                        neon_fp_mul_s,\
> +                        neon_fp_mul_s_q,\
> +                        neon_fp_mul_d,\
> +                        neon_fp_mul_d_q,\
> +                        neon_fp_mla_s,\
> +                        neon_fp_mla_s_q,\
> +                        neon_fp_mla_d,\
> +                        neon_fp_mla_d_q,\
> +                        neon_fp_mla_s_scalar,\
> +                        neon_fp_mla_s_scalar_q,\
> +                        neon_fp_mla_d_scalar_q,\
> +                        neon_sat_mul_b,\
> +                        neon_sat_mul_b_q,\
> +                        neon_sat_mul_h,\
> +                        neon_sat_mul_h_q,\
> +                        neon_sat_mul_s,\
> +                        neon_sat_mul_s_q,\
> +                        neon_sat_mul_h_scalar,\
> +                        neon_sat_mul_h_scalar_q,\
> +                        neon_sat_mul_s_scalar,\
> +                        neon_sat_mul_s_scalar_q,\
> +                        neon_sat_mul_h_scalar_long,\
> +                        neon_sat_mul_s_scalar_long,\
> +                        neon_sat_mla_b_long,\
> +                        neon_sat_mla_h_long,\
> +                        neon_sat_mla_s_long,\
> +                        neon_sat_mla_h_scalar_long,\
> +                        neon_sat_mla_s_scalar_long,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "fp_abd_diff" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_abd_s,\
> +                        neon_fp_abd_s_q,\
> +                        neon_fp_abd_d,\
> +                        neon_fp_abd_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_add" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_addsub_s,\
> +                        neon_fp_addsub_s_q,\
> +                        neon_fp_addsub_d,\
> +                        neon_fp_addsub_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_div" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_div_s,\
> +                        neon_fp_div_s_q,\
> +                        neon_fp_div_d,\
> +                        neon_fp_div_d_q,\
> +                       "))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "neon_f_neg" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_neg_s,\
> +                        neon_fp_neg_s_q,\
> +                        neon_fp_neg_d,\
> +                        neon_fp_neg_d_q,\
> +                        neon_fp_abs_s,\
> +                        neon_fp_abs_s_q,\
> +                        neon_fp_abs_d,\
> +                        neon_fp_abs_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_round" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_round_s,\
> +                        neon_fp_round_s_q,\
> +                        neon_fp_round_d,\
> +                        neon_fp_round_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_cvt" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type"  "neon_int_to_fp_s,\
> +                         neon_int_to_fp_s_q,\
> +                         neon_int_to_fp_d,\
> +                         neon_int_to_fp_d_q,\
> +                         neon_fp_cvt_widen_s,\
> +                         neon_fp_cvt_narrow_s_q,\
> +                         neon_fp_cvt_narrow_d_q,\
> +                        "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_f_reduc" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_reduc_add_s,\
> +                        neon_fp_reduc_add_s_q,\
> +                        neon_fp_reduc_add_d,\
> +                        neon_fp_reduc_add_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_cls" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_cls,neon_cls_q"))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_st1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_store1_one_lane,\
> +                        neon_store1_one_lane_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_halve_narrow" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_sub_halve_narrow_q,\
> +                        neon_add_halve_narrow_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_shift_acc" 6
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_shift_acc,\
> +                        neon_shift_acc_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_fp_compare" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_compare_s,\
> +                        neon_fp_compare_s_q,\
> +                        neon_fp_compare_d,\
> +                        neon_fp_compare_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_fp_sqrt" 2
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_sqrt_s,\
> +                        neon_fp_sqrt_s_q,\
> +                        neon_fp_sqrt_d,\
> +                        neon_fp_sqrt_d_q,\
> +                       "))
> +  "xgene1_decode1op,xgene_fp_divide")
> +
> +(define_insn_reservation "neon_tbl1" 4
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl1,\
> +                        neon_tbl1_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_tbl2" 8
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_tbl2,\
> +                        neon_tbl2_q,\
> +                       "))
> +  "xgene1_decodeIsolated")
> +
> +(define_insn_reservation "neon_permute" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_permute,\
> +                        neon_permute_q,\
> +                       "))
> +  "xgene1_decode2op")
> +
> +(define_insn_reservation "neon_ld1r" 10
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_load1_all_lanes,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_fp_recp" 3
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recpe_s,\
> +                        neon_fp_recpe_s_q,\
> +                        neon_fp_recpe_d,\
> +                        neon_fp_recpe_d_q,\
> +                        neon_fp_recpx_s,\
> +                        neon_fp_recpx_s_q,\
> +                        neon_fp_recpx_d,\
> +                        neon_fp_recpx_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +
> +(define_insn_reservation "neon_fp_recp_s" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_fp_recps_s,\
> +                        neon_fp_recps_s_q,\
> +                        neon_fp_recps_d,\
> +                        neon_fp_recps_d_q,\
> +                       "))
> +  "xgene1_decode1op")
> +
> +(define_insn_reservation "neon_pmull" 5
> +  (and (eq_attr "tune" "xgene1")
> +       (eq_attr "type" "neon_mul_d_long,\
> +                       "))
> +  "xgene1_decode2op")
> --
> 1.9.1
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-11-20 11:18 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-19 17:34 [PATCH 0/2, AArch64] APM X-Gene 1 cost-table and pipeline model Philipp Tomsich
2014-11-19 17:33 ` [PATCH 1/2, AArch64] Core definition for APM XGene-1 and associated cost-table Philipp Tomsich
2014-11-19 18:00   ` Kyrill Tkachov
2014-11-19 18:02     ` [PATCH 1/2, AArch64, v2] " Philipp Tomsich
2014-11-19 17:36 ` [PATCH 2/2, AArch64] Pipeline model for APM XGene-1 Philipp Tomsich
2014-11-19 18:08   ` Kyrill Tkachov
2014-11-19 18:11   ` Andrew Pinski
2014-11-19 18:11   ` Kyrill Tkachov
2014-11-19 19:45     ` Dr. Philipp Tomsich
2014-11-19 21:58     ` [PATCH 2/2, AArch64, v2] " Philipp Tomsich
2014-11-20  9:38       ` Kyrill Tkachov
2014-11-20 11:01         ` Dr. Philipp Tomsich
2014-11-20 11:01           ` Kyrill Tkachov
2014-11-20 11:25       ` Ramana Radhakrishnan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).