public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [AArch64] Add scheduling and cost models for Exynos M1
@ 2015-10-27 23:38 Evandro Menezes
  2015-10-28 10:40 ` James Greenhalgh
                   ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-10-27 23:38 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov'

[-- Attachment #1: Type: text/plain, Size: 259 bytes --]

This patch adds the scheduling and cost models for Exynos M1.

Though it’s a rather large patch, much of it is the DFA model for the
pipeline.  Still, I’d appreciate any feedback.

Please, commit if it’s alright.

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0001-AArch64-Add-scheduling-and-cost-models-for-Exynos-M1.patch --]
[-- Type: application/octet-stream, Size: 48525 bytes --]

From 78919fa9e3439df140487187084142da6d0b432f Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 27 Oct 2015 16:45:17 -0500
Subject: [PATCH] [AArch64] Add scheduling and cost models for Exynos M1

2015-10-25  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched and cost
	models.
	* config/aarch64/aarch64.c (aarch64_case_values_threshold): New function.
	(exynosm1_addrcost_table): New variable.
	(exynosm1_regmove_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(exynosm1_tunings): Likewise.
	(aarch64_override_options_internal): Tune heuristics specifically
	for Exynos M1.
	(TARGET_CASE_VALUES_THRESHOLD): Define macro.
	* config/arm/aarch-cost-tables.h (exynosm1_extra_costs): New variable.
	* config/arm/arm.c (arm_exynos_m1_tune): Likewise.
	* config/arm/arm-cores.def: Use the Exynos M1 sched and cost models.
	* config/arm/exynos-m1.md: New file.
	* config/arm/arm.md: Include new file.
---
 gcc/config/aarch64/aarch64-cores.def |   2 +-
 gcc/config/aarch64/aarch64.c         |  98 ++++
 gcc/config/arm/aarch-cost-tables.h   | 103 ++++
 gcc/config/arm/arm-cores.def         |   2 +-
 gcc/config/arm/arm.c                 |  23 +
 gcc/config/arm/arm.md                |   3 +-
 gcc/config/arm/exynos-m1.md          | 968 +++++++++++++++++++++++++++++++++++
 7 files changed, 1196 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/arm/exynos-m1.md

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..66be417 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4965041..1d13c61 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -138,6 +138,7 @@ static bool aarch64_vector_mode_supported_p (machine_mode);
 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 						 const unsigned char *sel);
 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
+static unsigned int aarch64_case_values_threshold (void);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -215,6 +216,22 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* register_offset  */
+  1, /* register_sextend  */
+  2, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
 static const struct cpu_addrcost_table xgene1_addrcost_table =
 {
     {
@@ -261,6 +278,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost (actual costs: 4 and 9).  */
+  9, /* GP2FP  */
+  9, /* FP2GP  */
+  1 /* FP2FP  */
+};
+
 static const struct cpu_regmove_cost thunderx_regmove_cost =
 {
   2, /* GP2GP  */
@@ -313,6 +340,22 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   1 /* cond_not_taken_branch_cost  */
 };
 
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+  1, /* scalar_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* vec_stmt_cost  */
+  3, /* vec_to_scalar_cost  */
+  3, /* scalar_to_vec_cost  */
+  5, /* vec_align_load_cost  */
+  5, /* vec_unalign_load_cost  */
+  1, /* vec_unalign_store_cost  */
+  1, /* vec_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1 /* cond_not_taken_branch_cost  */
+};
+
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
@@ -428,6 +471,28 @@ static const struct tune_params cortexa72_tunings =
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
+static const struct tune_params exynosm1_tunings =
+{
+  &exynosm1_extra_costs,
+  &exynosm1_addrcost_table,
+  &exynosm1_regmove_cost,
+  &exynosm1_vector_cost,
+  &generic_branch_cost,
+  4,	/* memmov_cost  */
+  3,	/* issue_rate  */
+  (AARCH64_FUSE_NOTHING), /* fusible_ops  */
+  4,	/* function_align.  */
+  4,	/* jump_align.  */
+  4,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+};
+
 static const struct tune_params thunderx_tunings =
 {
   &thunderx_extra_costs,
@@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Adjust the heuristics for Exynos M1.  */
+  if (selected_cpu->sched_core == exynosm1)
+    {
+      /* Increase the maximum peeling limit.  */
+      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
+                             400,
+                             opts->x_param_values,
+			     global_options_set.x_param_values);
+
+      /* Set the L1 cache line size.  */
+      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+                             64,
+                             opts->x_param_values,
+			     global_options_set.x_param_values);
+    }
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* Only Exynos M1 with optimize > 2 raises the bar for jump tables.  */
+  if (selected_cpu->sched_core != exynosm1
+      || optimize <= 2)
+    return default_case_values_threshold ();
+
+  return 48;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13527,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 /* Only the least significant bit is used for initialization guard
    variables.  */
 #undef TARGET_CXX_GUARD_MASK_BIT
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 66e09a8..850bde0 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -331,6 +331,109 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table exynosm1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (0), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (21),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (0)  /* alu.  */
+  }
+};
+
 const struct cpu_cost_table xgene1_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..18936f0 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index a598c84..9cd1ea2 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1991,6 +1991,29 @@ const struct tune_params arm_cortex_a57_tune =
   tune_params::SCHED_AUTOPREF_FULL
 };
 
+const struct tune_params arm_exynos_m1_tune =
+{
+  arm_9e_rtx_costs,
+  &exynosm1_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  3,						/* Issue rate.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  tune_params::FUSE_NOTHING,
+  tune_params::SCHED_AUTOPREF_OFF
+};
+
 const struct tune_params arm_xgene1_tune =
 {
   arm_9e_rtx_costs,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 02e147e..e6f07e9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..05011e4
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,968 @@
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "exynos_m1")
+
+(define_attr "exynos_m1_type"
+  "mla, mlal,
+   unknown"
+  (cond [
+	  (eq_attr "type" "mla, mlas, smlad, smladx,\
+			   smlawx, smlawy, smlaxy,\
+			   smlsd, smlsdx")
+	    (const_string "mla")
+
+	  (eq_attr "type" "smlal, smlals, smlald,\
+			   smlalxy, smlsld,\
+			   umaal, umlal, umlals")
+	    (const_string "mlal")]
+
+	  (const_string "unknown")))
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,\
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,\
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; Redefine this attribute when building the AArch64 backend.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "em1_alu")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mul" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "mul"))
+  "em1_c")
+
+;; Integer multiply-accumulate
+;; TODO: distinguish 32-bit from 64-bit ones
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mla"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mlal"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation "exynos_m1_neon_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually vary between
+;; 3-5 and 1/4-1; the slow end of both ranges (5 and 1/4) is modeled below.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2) + em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "(em1_ld + em1_fst)")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_fst + em1_nalu0), em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_fst + em1_nalu0), em1_st)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_ld + em1_fst + em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st + em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla")
+(define_bypass 4 "exynos_m1_mla*" "exynos_m1_mlal")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2016-03-16 19:48 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-27 23:38 [AArch64] Add scheduling and cost models for Exynos M1 Evandro Menezes
2015-10-28 10:40 ` James Greenhalgh
2015-10-28 10:45   ` Andrew Pinski
2015-10-28 21:58     ` Evandro Menezes
2015-10-29 23:02   ` Evandro Menezes
2015-10-28 11:01 ` Kyrill Tkachov
2015-10-29 19:38   ` Evandro Menezes
2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
2015-11-04 23:21     ` Evandro Menezes
2015-11-05  9:22       ` James Greenhalgh
2015-11-05 17:31         ` Evandro Menezes
2015-11-12 14:47           ` James Greenhalgh
2015-11-05 20:51   ` [PATCH 2/4][AArch64] Increase the loop peeling limit Evandro Menezes
2015-11-19 22:04     ` Evandro Menezes
2015-11-20 11:53       ` James Greenhalgh
2015-12-03 21:07         ` Evandro Menezes
2015-12-14 11:26           ` James Greenhalgh
2015-12-15 23:34             ` Evandro Menezes
2015-12-16 11:24               ` Richard Earnshaw (lists)
2015-12-16 12:42                 ` Richard Biener
2015-12-16 20:11                 ` Evandro Menezes
2016-01-08 22:55                   ` Evandro Menezes
2016-02-03 19:46                     ` Evandro Menezes
2016-03-16 19:48                       ` Evandro Menezes
2015-11-05 23:30   ` [PATCH 3/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
2015-11-05 23:30   ` Evandro Menezes
2015-11-09 23:06     ` Evandro Menezes
2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
2015-11-10 18:01         ` Ramana Radhakrishnan
2015-11-10 18:03           ` Ramana Radhakrishnan
2015-11-12 14:55         ` James Greenhalgh
2015-11-12 15:39           ` Evandro Menezes
2015-11-12 17:32             ` Evandro Menezes
2015-11-19 22:05               ` Evandro Menezes
2015-11-20 12:27               ` James Greenhalgh
2015-11-20 14:34                 ` Kyrill Tkachov
2015-11-20 15:56                   ` Evandro Menezes
2015-11-20 15:55                 ` Evandro Menezes
2015-11-20 16:16                   ` James Greenhalgh
2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
2015-11-19 22:06         ` Evandro Menezes
2015-11-20 17:17         ` James Greenhalgh
2015-11-20 22:07           ` Evandro Menezes
2015-12-03 20:58           ` Evandro Menezes
2015-12-04  9:25             ` Kyrill Tkachov
2015-12-07 19:55               ` Evandro Menezes
2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
2015-11-19 22:06     ` Evandro Menezes
2015-11-20 17:19       ` James Greenhalgh
2015-11-24  9:56     ` Kyrill Tkachov
2015-12-03 20:49     ` Evandro Menezes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).