public inbox for gcc-patches@gcc.gnu.org
* [AArch64] Add scheduling and cost models for Exynos M1
@ 2015-10-27 23:38 Evandro Menezes
  2015-10-28 10:40 ` James Greenhalgh
                   ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-10-27 23:38 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov'

[-- Attachment #1: Type: text/plain, Size: 259 bytes --]

This patch adds the scheduling and cost models for Exynos M1.

Though it’s a rather large patch, much of it is the DFA model for the
pipeline.  Still, I’d appreciate any feedback.

Please, commit if it’s alright.

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: 0001-AArch64-Add-scheduling-and-cost-models-for-Exynos-M1.patch --]
[-- Type: application/octet-stream, Size: 48525 bytes --]

From 78919fa9e3439df140487187084142da6d0b432f Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 27 Oct 2015 16:45:17 -0500
Subject: [PATCH] [AArch64] Add scheduling and cost models for Exynos M1

2015-10-25  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched and cost
	models.
	* config/aarch64/aarch64.c (aarch64_case_values_threshold): New function.
	(exynosm1_addrcost_table): New variable.
	(exynosm1_regmove_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(exynosm1_tunings): Likewise.
	(aarch64_override_options_internal): Tune heuristics specifically
	for Exynos M1.
	(TARGET_CASE_VALUES_THRESHOLD): Define macro.
	* config/arm/aarch-cost-tables.h (exynosm1_extra_costs): New variable.
	* config/arm/arm.c (arm_exynos_m1_tune): Likewise.
	* config/arm/arm-cores.def: Use the Exynos M1 sched and cost models.
	* config/arm/exynos-m1.md: New file.
	* config/arm/arm.md: Include new file.
---
 gcc/config/aarch64/aarch64-cores.def |   2 +-
 gcc/config/aarch64/aarch64.c         |  98 ++++
 gcc/config/arm/aarch-cost-tables.h   | 103 ++++
 gcc/config/arm/arm-cores.def         |   2 +-
 gcc/config/arm/arm.c                 |  23 +
 gcc/config/arm/arm.md                |   3 +-
 gcc/config/arm/exynos-m1.md          | 968 +++++++++++++++++++++++++++++++++++
 7 files changed, 1196 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/arm/exynos-m1.md

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..66be417 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4965041..1d13c61 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -138,6 +138,7 @@ static bool aarch64_vector_mode_supported_p (machine_mode);
 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 						 const unsigned char *sel);
 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
+static unsigned int aarch64_case_values_threshold (void);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -215,6 +216,22 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* register_offset  */
+  1, /* register_sextend  */
+  2, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
 static const struct cpu_addrcost_table xgene1_addrcost_table =
 {
     {
@@ -261,6 +278,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost (actual, 4 and 9).  */
+  9, /* GP2FP  */
+  9, /* FP2GP  */
+  1 /* FP2FP  */
+};
+
 static const struct cpu_regmove_cost thunderx_regmove_cost =
 {
   2, /* GP2GP  */
@@ -313,6 +340,22 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   1 /* cond_not_taken_branch_cost  */
 };
 
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+  1, /* scalar_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* vec_stmt_cost  */
+  3, /* vec_to_scalar_cost  */
+  3, /* scalar_to_vec_cost  */
+  5, /* vec_align_load_cost  */
+  5, /* vec_unalign_load_cost  */
+  1, /* vec_unalign_store_cost  */
+  1, /* vec_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1 /* cond_not_taken_branch_cost  */
+};
+
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
@@ -428,6 +471,28 @@ static const struct tune_params cortexa72_tunings =
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
+static const struct tune_params exynosm1_tunings =
+{
+  &exynosm1_extra_costs,
+  &exynosm1_addrcost_table,
+  &exynosm1_regmove_cost,
+  &exynosm1_vector_cost,
+  &generic_branch_cost,
+  4,	/* memmov_cost  */
+  3,	/* issue_rate  */
+  (AARCH64_FUSE_NOTHING), /* fusible_ops  */
+  4,	/* function_align.  */
+  4,	/* jump_align.  */
+  4,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+};
+
 static const struct tune_params thunderx_tunings =
 {
   &thunderx_extra_costs,
@@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Adjust the heuristics for Exynos M1.  */
+  if (selected_cpu->sched_core == exynosm1)
+    {
+      /* Increase the maximum peeling limit.  */
+      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
+                             400,
+                             opts->x_param_values,
+			     global_options_set.x_param_values);
+
+      /* Set the L1 cache line size.  */
+      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+                             64,
+                             opts->x_param_values,
+			     global_options_set.x_param_values);
+    }
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* For Exynos M1, raise the bar for using jump tables.  */
+  if (selected_cpu->sched_core == exynosm1
+      && optimize > 2)
+    return 48;
+  else
+    return default_case_values_threshold ();
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13527,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 /* Only the least significant bit is used for initialization guard
    variables.  */
 #undef TARGET_CXX_GUARD_MASK_BIT
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 66e09a8..850bde0 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -331,6 +331,109 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table exynosm1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (0), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (21),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (0)  /* alu.  */
+  }
+};
+
 const struct cpu_cost_table xgene1_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..18936f0 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index a598c84..9cd1ea2 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1991,6 +1991,29 @@ const struct tune_params arm_cortex_a57_tune =
   tune_params::SCHED_AUTOPREF_FULL
 };
 
+const struct tune_params arm_exynos_m1_tune =
+{
+  arm_9e_rtx_costs,
+  &exynosm1_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  3,						/* Issue rate.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  tune_params::FUSE_NOTHING,
+  tune_params::SCHED_AUTOPREF_OFF
+};
+
 const struct tune_params arm_xgene1_tune =
 {
   arm_9e_rtx_costs,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 02e147e..e6f07e9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..05011e4
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,968 @@
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "exynos_m1")
+
+(define_attr "exynos_m1_type"
+  "mla, mlal,
+   unknown"
+  (cond [
+	  (eq_attr "type" "mla, mlas, smlad, smladx,\
+			   smlawx, smlawy, smlaxy,\
+			   smlsd, smlsdx")
+	    (const_string "mla")
+
+	  (eq_attr "type" "smlal, smlals, smlald,\
+			   smlalxy, smlsld,\
+			   umaal, umlal, umlals")
+	    (const_string "mlal")]
+
+	  (const_string "unknown")))
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; Redefine this attribute for when building the AArch64 backend.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "(em1_alu)")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mul" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "mul"))
+  "em1_c")
+
+;; Integer multiply-accumulate
+;; TODO: tell 32 from 64-bit ones
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mla"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mlal"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation  "exynos_m1_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation  "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation  "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually varies between
+;; 3-5 and 1/4-1, but picked the median values.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2) + em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu, em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "(em1_ld + em1_fst)")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld, em1_ld)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4) + em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_fst + em1_nalu0), em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_fst + em1_nalu0), em1_st)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_ld + em1_fst + em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st + em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla")
+(define_bypass 4 "exynos_m1_mla*" "exynos_m1_mlal")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")
-- 
2.1.0.243.g30d45f7



* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-27 23:38 [AArch64] Add scheduling and cost models for Exynos M1 Evandro Menezes
@ 2015-10-28 10:40 ` James Greenhalgh
  2015-10-28 10:45   ` Andrew Pinski
  2015-10-29 23:02   ` Evandro Menezes
  2015-10-28 11:01 ` Kyrill Tkachov
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
  2 siblings, 2 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-10-28 10:40 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov'

On Tue, Oct 27, 2015 at 06:12:48PM -0500, Evandro Menezes wrote:
> This patch adds the scheduling and cost models for Exynos M1.
> 
> Though it's a rather large patch, much of it is the DFA model for the
> pipeline.  Still, I'd appreciate any feedback.
> 
> Please, commit if it's alright.

Hi Evandro,

Thanks for the patch, I have some comments.

To ease review, could I ask you to turn this into a patch series? Roughly
structured like so:

  1/4: Add the Exynos-M1 cost models.
  2/4: Add the Exynos M1 scheduling model.
  3/4: Add the infrastructure for TARGET_CASE_VALUES_THRESHOLD.
  4/4: Add the extra tuning heuristics.

Your support is missing a critical hunk for AArch64; there should be an

  (include "../arm/exynos-m1.md")

in aarch64.md to get this working.

This is a fairly large pipeline description (add (automata_option "stats")
to the .md file):

 Automaton `exynos_m1' 
    62320 NDFA states,          489094 NDFA arcs
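
For reference, that just means temporarily dropping something like this into
exynos-m1.md while tuning the model:

  ;; Report DFA statistics (state and arc counts) at build time.
  (automata_option "stats")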

From experience, you get little benefit from such a complex model, but you
do slow down bootstrap times. It isn't for me to say where the model can be
trimmed (I don't have access to documentation for the Exynos-M1), but
you may find it useful to split out the SIMD/FP automaton, and look at whether
your modelling of long latency instructions is entirely necessary. Have a
look at the Cortex-A57 and Cortex-A53 for some examples of what I mean.
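
A split along those lines might look something like the sketch below in
exynos-m1.md (the second automaton name is made up for illustration; the
unit names are the ones from your patch):

  ;; Hypothetical split: generate the FP/SIMD units as a separate automaton
  ;; so the integer/load/store and vector state machines are built
  ;; independently, which usually keeps both much smaller.
  (define_automaton "exynos_m1, exynos_m1_fp")
  (define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1")
  (define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")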

For comparison, here are the stats for Cortex-A53 and Cortex-A57:

 Automaton `cortex_a53'
    281 NDFA states,           1158 NDFA arcs
 Automaton `cortex_a53_advsimd'
    9072 NDFA states,          49572 NDFA arcs
 Automaton `cortex_a57'
    764 NDFA states,           3600 NDFA arcs 
 Automaton `cortex_a57_cx'
    204 NDFA states,            864 NDFA arcs 

> @@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
>  			 opts->x_param_values,
>  			 global_options_set.x_param_values);
>  
> +  /* Adjust the heuristics for Exynos M1.  */
> +  if (selected_cpu->sched_core == exynosm1)

I think it would be preferable to pull these tuning parameters into
the target structures somehow, rather than guarding them off by specific
CPUs.
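
One possible shape for that is a small per-core override record consumed
from aarch64_override_options_internal; the structure and field names below
are hypothetical, not existing GCC interfaces:

  /* Hypothetical per-core parameter overrides; 0 means "keep the default".  */
  struct cpu_param_overrides
  {
    unsigned int max_peeled_insns;    /* PARAM_MAX_COMPLETELY_PEELED_INSNS.  */
    unsigned int l1_cache_line_size;  /* PARAM_L1_CACHE_LINE_SIZE.  */
  };

  static const struct cpu_param_overrides exynosm1_param_overrides =
  {
    400, /* max_peeled_insns  */
    64   /* l1_cache_line_size  */
  };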

> +    {
> +      /* Increase the maximum peeling limit.  */
> +      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
> +                             400,
> +                             opts->x_param_values,
> +			     global_options_set.x_param_values);
> +
> +      /* Set the L1 cache line size.  */
> +      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
> +                             64,
> +                             opts->x_param_values,
> +			     global_options_set.x_param_values);
> +    }
> +
>    aarch64_override_options_after_change_1 (opts);
>  }
>  


> @@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
>      return float_type_node;
>    return NULL_TREE;
>  }
> +
> +/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
> +
> +static unsigned int
> +aarch64_case_values_threshold (void)
> +{
> +  /* For Exynos M1, raise the bar for using jump tables.  */
> +  if (selected_cpu->sched_core == exynosm1
> +      && optimize > 2)
> +    return 48;

Likewise, I think this should end up in the per-core tuning structures
rather than masked off by selected_cpu->sched_core == exynosm1.
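
For example, the hook could then read the threshold from the active tuning
structure; the case_values_threshold field (and reading it through
aarch64_tune_params) is an assumption for the sake of the sketch, not an
existing member of tune_params:

  static unsigned int
  aarch64_case_values_threshold (void)
  {
    /* Sketch: 0 in the tuning structure means "use the generic default".  */
    if (optimize > 2 && aarch64_tune_params.case_values_threshold > 0)
      return aarch64_tune_params.case_values_threshold;
    return default_case_values_threshold ();
  }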

> +  else
> +    return default_case_values_threshold ();
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  

Thanks,
James


* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-28 10:40 ` James Greenhalgh
@ 2015-10-28 10:45   ` Andrew Pinski
  2015-10-28 21:58     ` Evandro Menezes
  2015-10-29 23:02   ` Evandro Menezes
  1 sibling, 1 reply; 52+ messages in thread
From: Andrew Pinski @ 2015-10-28 10:45 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: Evandro Menezes, gcc-patches, Marcus Shawcroft, Kyrill Tkachov

On Wed, Oct 28, 2015 at 6:36 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Tue, Oct 27, 2015 at 06:12:48PM -0500, Evandro Menezes wrote:
>> This patch adds the scheduling and cost models for Exynos M1.
>>
>> Though it's a rather large patch, much of it is the DFA model for the
>> pipeline.  Still, I'd appreciate any feedback.
>>
>> Please, commit if it's alright.
>
> Hi Evandro,
>
> Thanks for the patch, I have some comments.
>
> To ease review, could I ask you to turn this in to a patch series? Roughly
> structured as so:
>
>   1/4: Add the Exynos-M1 cost models.
>   2/4: Add the Exynos M1 scheduling model.
>   3/4: Add the infrastructure for TARGET_CASE_VALUES_THRESHOLD.
>   4/4: Add the extra tuning heuristics.
>
> Your support is missing a critical hunk for AArch64, there should be an
>
>   (include "../arm/exynos-m1.md")
>
> in aarch64.md to get this working.
>
> This is a fairly large pipeline description (add (automata_option "stats")
> to the .md file):
>
>  Automaton `exynos_m1'
>     62320 NDFA states,          489094 NDFA arcs
>
> From experience, you get little benefit from such a complex model, but you
> do slow down bootstrap times. It isn't for me to say where the model can be
> trimmed (I don't have access to documentation for the Exynos-M1), but
> you may find it useful to split out the SIMD/FP automaton, and look at whether
> your modelling of long latency instructions is entirely necessary. Have a
> look at the Cortex-A57 and Cortex-A53 for some examples of what I mean.
>
> For comparison, here are the stats for Cortex-A53 and Cortex-A57:
>
>  Automaton `cortex_a53'
>     281 NDFA states,           1158 NDFA arcs
>  Automaton `cortex_a53_advsimd'
>     9072 NDFA states,          49572 NDFA arcs
>  Automaton `cortex_a57'
>     764 NDFA states,           3600 NDFA arcs
>  Automaton `cortex_a57_cx'
>     204 NDFA states,            864 NDFA arcs
>
>> @@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
>>                        opts->x_param_values,
>>                        global_options_set.x_param_values);
>>
>> +  /* Adjust the heuristics for Exynos M1.  */
>> +  if (selected_cpu->sched_core == exynosm1)
>
> I think it would be preferable to pull these tuning parameters in to
> the target structures somehow, rather than guarding them off by specific
> CPUs.
>
>> +    {
>> +      /* Increase the maximum peeling limit.  */
>> +      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
>> +                             400,
>> +                             opts->x_param_values,
>> +                          global_options_set.x_param_values);
>> +
>> +      /* Set the L1 cache line size.  */
>> +      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
>> +                             64,
>> +                             opts->x_param_values,
>> +                          global_options_set.x_param_values);
>> +    }
>> +
>>    aarch64_override_options_after_change_1 (opts);
>>  }

I have a patch for the L1 cache size (prefetch) infrastructure which
sets it via tuning parameters, but I have not had time to submit it
yet.
Also, the peeling parameter change helps ThunderX too.

Thanks,
Andrew

>>
>
>
>> @@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
>>      return float_type_node;
>>    return NULL_TREE;
>>  }
>> +
>> +/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
>> +
>> +static unsigned int
>> +aarch64_case_values_threshold (void)
>> +{
>> +  /* For Exynos M1, raise the bar for using jump tables.  */
>> +  if (selected_cpu->sched_core == exynosm1
>> +      && optimize > 2)
>> +    return 48;
>
> Likewise, I think this should end up in the per-core tuning structures
> rather than masked off by selected_cpu->sched_core == exynosm1.
>
>> +  else
>> +    return default_case_values_threshold ();
>> +}
>> +
>>  #undef TARGET_ADDRESS_COST
>>  #define TARGET_ADDRESS_COST aarch64_address_cost
>>
>
> Thanks,
> James
>


* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-27 23:38 [AArch64] Add scheduling and cost models for Exynos M1 Evandro Menezes
  2015-10-28 10:40 ` James Greenhalgh
@ 2015-10-28 11:01 ` Kyrill Tkachov
  2015-10-29 19:38   ` Evandro Menezes
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
  2 siblings, 1 reply; 52+ messages in thread
From: Kyrill Tkachov @ 2015-10-28 11:01 UTC (permalink / raw)
  To: Evandro Menezes, 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh'

Hi Evandro,

On 27/10/15 23:12, Evandro Menezes wrote:
> This patch adds the scheduling and cost models for Exynos M1.
>
> Though it’s a rather large patch, much of it is the DFA model for the
> pipeline.  Still, I’d appreciate any feedback.
>
> Please, commit if it’s alright.
>
> Thank you,
>

On top of James' comments about splitting this up,

2015-10-25  Evandro Menezes<e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched and cost
	models.
	* config/aarch64/aarch64.c (aarch64_case_values_threshold): New function.
	(exynosm1_addrcost_table): New variable.
	(exynosm1_regmove_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(exynosm1_tunings): Likewise.
	(aarch64_override_options_internal): Tune heuristics specifically
	for Exynos M1.
	(TARGET_CASE_VALUES_THRESHOLD): Define macro.
	* config/arm/aarch-cost-tables.h (exynosm1_extra_costs): New variable.
	* config/arm/arm.c (arm_exynos_m1_tune): Likewise.
	* config/arm/arm-cores.def: Use the Exynos M1 sched and cost models.
	* config/arm/exynos-m1.md: New file.
	* config/arm/arm.md: Include new file.


Note that on arm you also need to say that config/arm/arm-tune.md is regenerated.
I know, the arm .md files need updating in many places when adding a new core.
Would be nice to consolidate all of that at some point.

Also, can you please confirm that you tested this on an arm target as well as
an aarch64 one?

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-28 10:45   ` Andrew Pinski
@ 2015-10-28 21:58     ` Evandro Menezes
  0 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-10-28 21:58 UTC (permalink / raw)
  To: Andrew Pinski, James Greenhalgh
  Cc: gcc-patches, Marcus Shawcroft, Kyrill Tkachov

Andrew,

I need to do more investigation WRT prefetching, especially in light of
Kyrill's recent patch.

The change to the peeling limit also benefits A57.  If James or Marcus 
could confirm this, I'd be glad to add it to all AArch64 targets, 
assuming that Xgene would be fine with it too.

Thank you,

-- 
Evandro Menezes

On 10/28/2015 05:40 AM, Andrew Pinski wrote:
> On Wed, Oct 28, 2015 at 6:36 PM, James Greenhalgh
> <james.greenhalgh@arm.com> wrote:
>> On Tue, Oct 27, 2015 at 06:12:48PM -0500, Evandro Menezes wrote:
>>> This patch adds the scheduling and cost models for Exynos M1.
>>>
> >>> Though it's a rather large patch, much of it is the DFA model for the
> >>> pipeline.  Still, I'd appreciate any feedback.
> >>>
> >>> Please, commit if it's alright.
>> Hi Evandro,
>>
>> Thanks for the patch, I have some comments.
>>
>> To ease review, could I ask you to turn this in to a patch series? Roughly
>> structured as so:
>>
>>    1/4: Add the Exynos-M1 cost models.
>>    2/4: Add the Exynos M1 scheduling model.
>>    3/4: Add the infrastructure for TARGET_CASE_VALUES_THRESHOLD.
>>    4/4: Add the extra tuning heuristics.
>>
>> Your support is missing a critical hunk for AArch64, there should be an
>>
>>    (include "../arm/exynos-m1.md")
>>
>> in aarch64.md to get this working.
>>
>> This is a fairly large pipeline description (add (automata_option "stats")
>> to the .md file):
>>
>>   Automaton `exynos_m1'
>>      62320 NDFA states,          489094 NDFA arcs
>>
>>  From experience, you get little benefit from such a complex model, but you
>> do slow bootstrap times. It isn't for me to say where the model can be
>> trimmed (I don't have access to documentation for the Exynos-M1), but
>> you may find it useful to split out the SIMD/FP automaton, and look at whether
>> your modelling of long latency instructions is entirely necessary. Have a
>> look at the Cortex-A57 and Cortex-A53 for some examples of what I mean.
>>
>> For comparison, here are the stats for Cortex-A53 and Cortex-A57:
>>
>>   Automaton `cortex_a53'
>>      281 NDFA states,           1158 NDFA arcs
>>   Automaton `cortex_a53_advsimd'
>>      9072 NDFA states,          49572 NDFA arcs
>>   Automaton `cortex_a57'
>>      764 NDFA states,           3600 NDFA arcs
>>   Automaton `cortex_a57_cx'
>>      204 NDFA states,            864 NDFA arcs
>>
>>> @@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
>>>                         opts->x_param_values,
>>>                         global_options_set.x_param_values);
>>>
>>> +  /* Adjust the heuristics for Exynos M1.  */
>>> +  if (selected_cpu->sched_core == exynosm1)
>> I think it would be preferable to pull these tuning parameters in to
>> the target structures somehow, rather than guarding them off by specific
>> CPUs.
>>
>>> +    {
>>> +      /* Increase the maximum peeling limit.  */
>>> +      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
>>> +                             400,
>>> +                             opts->x_param_values,
>>> +                          global_options_set.x_param_values);
>>> +
>>> +      /* Set the L1 cache line size.  */
>>> +      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
>>> +                             64,
>>> +                             opts->x_param_values,
>>> +                          global_options_set.x_param_values);
>>> +    }
>>> +
>>>     aarch64_override_options_after_change_1 (opts);
>>>   }
> I have a patch for the L1 cache size (prefetch) infrastructure which
> sets it via tuning parameters, but I have not had time to submit it
> yet.
> Also, the peeling parameter change helps ThunderX too.
>
> Thanks,
> Andrew
>
>>
>>> @@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
>>>       return float_type_node;
>>>     return NULL_TREE;
>>>   }
>>> +
>>> +/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
>>> +
>>> +static unsigned int
>>> +aarch64_case_values_threshold (void)
>>> +{
>>> +  /* For Exynos M1, raise the bar for using jump tables.  */
>>> +  if (selected_cpu->sched_core == exynosm1
>>> +      && optimize > 2)
>>> +    return 48;
>> Likewise, I think this should end up in the per-core tuning structures
>> rather than masked off by selected_cpu->sched_core == exynosm1.
>>
>>> +  else
>>> +    return default_case_values_threshold ();
>>> +}
>>> +
>>>   #undef TARGET_ADDRESS_COST
>>>   #define TARGET_ADDRESS_COST aarch64_address_cost
>>>
>> Thanks,
>> James
>>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-28 11:01 ` Kyrill Tkachov
@ 2015-10-29 19:38   ` Evandro Menezes
  0 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-10-29 19:38 UTC (permalink / raw)
  To: Kyrill Tkachov, 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh'

Hi, Kyrill.

True dat, I missed arm-tune.md.

And, yes, bootstrapped cross and natively on AArch64 and cross on ARM.

Thank you,

-- 
Evandro Menezes

On 10/28/2015 05:57 AM, Kyrill Tkachov wrote:
> Hi Evandro,
>
> On 27/10/15 23:12, Evandro Menezes wrote:
>> This patch adds the scheduling and cost models for Exynos M1.
>>
>> Though it’s a rather large patch, much of it is the DFA model for the
>> pipeline.  Still, I’d appreciate any feedback.
>>
>> Please, commit if it’s alright.
>>
>> Thank you,
>>
>
> On top of James' comments about splitting this up,
>
> 2015-10-25  Evandro Menezes<e.menezes@samsung.com>
>
> gcc/
>     * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched and cost
>     models.
>     * config/aarch64/aarch64.c (aarch64_case_values_threshold): New 
> function.
>     (exynosm1_addrcost_table): New variable.
>     (exynosm1_regmove_cost): Likewise.
>     (exynosm1_vector_cost): Likewise.
>     (exynosm1_tunings): Likewise.
>     (aarch64_override_options_internal): Tune heuristics specifically
>     for Exynos M1.
>     (TARGET_CASE_VALUES_THRESHOLD): Define macro.
>     * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): New 
> variable.
>     * config/arm/arm.c (arm_exynos_m1_tune): Likewise.
>     * config/arm/arm-cores.def: Use the Exynos M1 sched and cost models.
>     * config/arm/exynos-m1.md: New file.
>     * config/arm/arm.md: Include new file.
>
>
> Note that on arm you also need to say that config/arm/arm-tune.md is 
> regenerated.
> I know, the arm .md files need updating in many places when adding a 
> new core.
> Would be nice to consolidate all of that at some point.
>
> Also, can you please confirm that you tested this on an arm target as 
> well as
> an aarch64 one?
>
> Thanks,
> Kyrill
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [AArch64] Add scheduling and cost models for Exynos M1
  2015-10-28 10:40 ` James Greenhalgh
  2015-10-28 10:45   ` Andrew Pinski
@ 2015-10-29 23:02   ` Evandro Menezes
  1 sibling, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-10-29 23:02 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov'

Hi, James.

On 10/28/2015 05:36 AM, James Greenhalgh wrote:
> On Tue, Oct 27, 2015 at 06:12:48PM -0500, Evandro Menezes wrote:
>> This patch adds the scheduling and cost models for Exynos M1.
>>
>> Though it?s a rather large patch, much of it is the DFA model for the
>> pipeline.? Still, I?d appreciate any feedback.
>>
>> Please, commit if it?s alright.
> Hi Evandro,
>
> Thanks for the patch, I have some comments.
>
> To ease review, could I ask you to turn this in to a patch series? Roughly
> structured as so:
>
>    1/4: Add the Exynos-M1 cost models.
>    2/4: Add the Exynos M1 scheduling model.
>    3/4: Add the infrastructure for TARGET_CASE_VALUES_THRESHOLD.
>    4/4: Add the extra tuning heuristics.
Will do.
> Your support is missing a critical hunk for AArch64, there should be an
>
>    (include "../arm/exynos-m1.md")
>
> in aarch64.md to get this working.
>
> This is a fairly large pipeline description (add (automata_option "stats")
> to the .md file):
>
>   Automaton `exynos_m1'
>      62320 NDFA states,          489094 NDFA arcs
>
> From experience, you get little benefit from such a complex model, but you
> do slow bootstrap times. It isn't for me to say where the model can be
> trimmed (I don't have access to documentation for the Exynos-M1), but
> you may find it useful to split out the SIMD/FP automaton, and look at whether
> your modelling of long latency instructions is entirely necessary. Have a
> look at the Cortex-A57 and Cortex-A53 for some examples of what I mean.
>
> For comparison, here are the stats for Cortex-A53 and Cortex-A57:
>
>   Automaton `cortex_a53'
>      281 NDFA states,           1158 NDFA arcs
>   Automaton `cortex_a53_advsimd'
>      9072 NDFA states,          49572 NDFA arcs
>   Automaton `cortex_a57'
>      764 NDFA states,           3600 NDFA arcs
>   Automaton `cortex_a57_cx'
>      204 NDFA states,            864 NDFA arcs
Splitting the automaton in two, one for the general pipelines and the other
for FP, brought the DFAs down to 1520 states with 10170 arcs for one and
50 states with 174 arcs for the other.  These figures seem reasonable to me,
but I'll have to test this split further.
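
In rough terms, the split just declares a second automaton and moves the
FP/SIMD units into it, so that the generator no longer has to build the
cross product of the two groups of states.  A sketch (the automaton and
unit names here are illustrative, not necessarily the final ones):

  (define_automaton "exynos_m1_gp")
  (define_automaton "exynos_m1_fp")

  ;; The integer pipelines stay in the general-purpose automaton.
  (define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")

  ;; The FP/SIMD pipelines get an automaton of their own.
  (define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
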
>> @@ -7672,6 +7737,22 @@ aarch64_override_options_internal (struct gcc_options *opts)
>>   			 opts->x_param_values,
>>   			 global_options_set.x_param_values);
>>   
>> +  /* Adjust the heuristics for Exynos M1.  */
>> +  if (selected_cpu->sched_core == exynosm1)
> I think it would be preferable to pull these tuning parameters in to
> the target structures somehow, rather than guarding them off by specific
> CPUs.
>> +    {
>> +      /* Increase the maximum peeling limit.  */
>> +      maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
>> +                             400,
>> +                             opts->x_param_values,
>> +			     global_options_set.x_param_values);
>> +
Actually, I observed that increasing the peeling limit is also beneficial
for A57.  I didn't try A53, but I guess that it wouldn't hurt it.  Given
that this also benefits ThunderX, I wonder if this override could always
be applied for AArch64.
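
That is, the call could simply be hoisted out of the CPU check, e.g. (just
a sketch, assuming that no core needs to opt out):

  /* Increase the maximum peeling limit for all AArch64 targets.  */
  maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, 400,
                         opts->x_param_values,
                         global_options_set.x_param_values);
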
>> +      /* Set the L1 cache line size.  */
>> +      maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
>> +                             64,
>> +                             opts->x_param_values,
>> +			     global_options_set.x_param_values);
>> +    }
>> +
>>     aarch64_override_options_after_change_1 (opts);
>>   }
Is adding a new member to tune_params the right place to specify the line
size for each CPU?
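
I.e., something along these lines (only a sketch):

  /* In struct tune_params:  */
  int cache_line_size;  /* Cache line size; or 0 for the default.  */

  /* In aarch64_override_options_internal:  */
  if (selected_cpu->tune->cache_line_size != 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
                           selected_cpu->tune->cache_line_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
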
>> @@ -13382,6 +13463,20 @@ aarch64_promoted_type (const_tree t)
>>       return float_type_node;
>>     return NULL_TREE;
>>   }
>> +
>> +/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
>> +
>> +static unsigned int
>> +aarch64_case_values_threshold (void)
>> +{
>> +  /* For Exynos M1, raise the bar for using jump tables.  */
>> +  if (selected_cpu->sched_core == exynosm1
>> +      && optimize > 2)
>> +    return 48;
> Likewise, I think this should end up in the per-core tuning structures
> rather than masked off by selected_cpu->sched_core == exynosm1.
>> +  else
>> +    return default_case_values_threshold ();
>> +}
>> +
>>   #undef TARGET_ADDRESS_COST
>>   #define TARGET_ADDRESS_COST aarch64_address_cost
> Thanks,
> James
Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 0/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-10-27 23:38 [AArch64] Add scheduling and cost models for Exynos M1 Evandro Menezes
  2015-10-28 10:40 ` James Greenhalgh
  2015-10-28 11:01 ` Kyrill Tkachov
@ 2015-11-04 23:10 ` Evandro Menezes
  2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
                     ` (4 more replies)
  2 siblings, 5 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-04 23:10 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

Following the suggestions on how to add the support for the Exynos M1
models, this series of patches is broken down into:

  * add more target specific tuning data
  * add heuristics tuning
  * add the Exynos M1 cost model
  * add the Exynos M1 scheduling model

Thank you,

-- 
Evandro Menezes


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 1/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
@ 2015-11-04 23:18   ` Evandro Menezes
  2015-11-04 23:21     ` Evandro Menezes
  2015-11-05 20:51   ` [PATCH 2/4][AArch64] Increase the loop peeling limit Evandro Menezes
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-04 23:18 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 660 bytes --]

This patch adds extra tuning information about AArch64 targets:

  * Maximum number of case values before resorting to a jump table
    The backend-independent default may be rather low for modern processors,
    which sport quite efficient direct branch prediction, whereas indirect
    branch prediction is typically still not as efficient.  This value may
    be set specifically for a processor or left at zero to use the default.
  * L1 cache line size
    The auto-prefetcher uses this information when emitting software
    prefetch insns (see the sketch below for how a core would set these).
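
For illustration, a core that wants to override both would just fill in
the new fields in its tune_params entry, e.g. (a sketch; the values below
are the ones discussed earlier in this thread, not necessarily the final
ones):

    ...
    48,   /* max_case_values.  */
    64,   /* cache_line_size.  */
    ...

A value of zero in either field keeps the current default behavior.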

Please, commit if it's alright.

Thank you,

-- 
Evandro Menezes



[-- Attachment #2: target.patch --]
[-- Type: text/x-patch, Size: 46004 bytes --]

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..66be417 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 81792bc..ecf4685 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -195,6 +195,9 @@ struct tune_params
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
+  int max_case_values; /* Case values threshold; or 0 for the default.  */
+
+  int cache_line_size; /* Cache line size; or 0 for the default.  */
 
 /* An enum specifying how to take into account CPU autoprefetch capabilities
    during instruction scheduling:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5c8604f..e7f1c07 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -355,6 +355,8 @@ static const struct tune_params generic_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -378,6 +380,8 @@ static const struct tune_params cortexa53_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -401,6 +405,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
 };
@@ -424,6 +430,8 @@ static const struct tune_params cortexa72_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -446,6 +454,8 @@ static const struct tune_params thunderx_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -468,6 +478,8 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -3242,6 +3254,20 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
   return aarch64_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* Use the specified limit for the number of cases before using jump
+     tables at higher optimization levels.  */
+  if (optimize > 2
+      && selected_cpu->tune->max_case_values != 0)
+    return selected_cpu->tune->max_case_values;
+  else
+    return default_case_values_threshold ();
+}
+
 /* Return true if register REGNO is a valid index register.
    STRICT_P is true if REG_OK_STRICT is in effect.  */
 
@@ -7672,6 +7698,13 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Set the L1 cache line size.  */
+  if (selected_cpu->tune->cache_line_size != 0)
+    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			   selected_cpu->tune->cache_line_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13385,6 +13418,7 @@ aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13466,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9c4eb52..d526b52 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -202,6 +202,7 @@
 ;; Scheduling
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a57.md")
+(include "../arm/exynos-m1.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
 
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 66e09a8..850bde0 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -331,6 +331,109 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table exynosm1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (0), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (21),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (0)  /* alu.  */
+  }
+};
+
 const struct cpu_cost_table xgene1_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..18936f0 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 4310638..6d5a64e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1991,6 +1991,29 @@ const struct tune_params arm_cortex_a57_tune =
   tune_params::SCHED_AUTOPREF_FULL
 };
 
+const struct tune_params arm_exynos_m1_tune =
+{
+  arm_9e_rtx_costs,
+  &exynosm1_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  3,						/* Issue rate.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  tune_params::FUSE_NOTHING,
+  tune_params::SCHED_AUTOPREF_OFF
+};
+
 const struct tune_params arm_xgene1_tune =
 {
   arm_9e_rtx_costs,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 02e147e..e6f07e9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..a4f1138
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,974 @@
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_attr "exynos_m1_type"
+  "mla, mlal,
+   unknown"
+  (cond [
+	  (eq_attr "type" "mla, mlas, smlad, smladx,\
+			   smlawx, smlawy, smlaxy,\
+			   smlsd, smlsdx")
+	    (const_string "mla")
+
+	  (eq_attr "type" "smlal, smlals, smlald,\
+			   smlalxy, smlsld,\
+			   umaal, umlal, umlals")
+	    (const_string "mlal")]
+
+	  (const_string "unknown")))
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; Redefine this attribute for when building the AArch64 backend.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+(define_automaton "exynos_m1_gp")
+(define_automaton "exynos_m1_ls")
+(define_automaton "exynos_m1_fp")
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1_gp")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1_ls")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Common occurrences
+(define_reservation "em1_sfst" "(em1_fst + em1_st)")
+(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "(em1_alu)")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mul" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "mul"))
+  "em1_c")
+
+;; Integer multiply-accumulate
+;; TODO: tell 32 from 64-bit ones
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mla"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mlal"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation  "exynos_m1_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation  "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation  "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually varies between
+;; 3-5 and 1/4-1, but picked the median values.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2), em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu, em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "em1_lfst")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld, em1_ld)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2), (em1_nalu * 2))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4), (em1_nalu * 3))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4), (em1_nalu * 4))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "em1_sfst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_sfst * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_sfst * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_sfst * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "em1_sfst, em1_fst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_sfst * 3), (em1_fst * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 17
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_sfst * 4), (em1_fst * 2), em1_nalu)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_fcvt, em1_lfst)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st, em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla")
+(define_bypass 4 "exynos_m1_mla*" "exynos_m1_mlal")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
@ 2015-11-04 23:21     ` Evandro Menezes
  2015-11-05  9:22       ` James Greenhalgh
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-04 23:21 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 799 bytes --]

Please, ignore the previous patch.  This is the intended patch.

Sorry.

-- 
Evandro Menezes

On 11/04/2015 05:18 PM, Evandro Menezes wrote:
> This patch adds extra tuning information about AArch64 targets:
>
>  * Maximum number of case values before resorting to a jump table
>    The default values assumed independently of the specific backends
>    may be rather low for modern processors, which sport quite efficient
>    direct branch prediction, whereas indirect branch prediction is
>    still typically not so efficient.  This value may be specifically
>    set for a processor or left at zero to use the default values.
>  * L1 cache line size
>    The auto-prefetcher uses this information when emitting software
>    prefetch insns.
>
> Please, commit if it's alright.
>
> Thank you,
>


[-- Attachment #2: target.patch --]
[-- Type: text/x-patch, Size: 4613 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 81792bc..ecf4685 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -195,6 +195,9 @@ struct tune_params
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
+  int max_case_values; /* Case values threshold; or 0 for the default.  */
+
+  int cache_line_size; /* Cache line size; or 0 for the default.  */
 
 /* An enum specifying how to take into account CPU autoprefetch capabilities
    during instruction scheduling:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5c8604f..e7f1c07 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -355,6 +355,8 @@ static const struct tune_params generic_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -378,6 +380,8 @@ static const struct tune_params cortexa53_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -401,6 +405,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
 };
@@ -424,6 +430,8 @@ static const struct tune_params cortexa72_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -446,6 +454,8 @@ static const struct tune_params thunderx_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -468,6 +478,8 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -3242,6 +3254,20 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
   return aarch64_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* Use the specified limit for the number of cases before using jump
+     tables at higher optimization levels.  */
+  if (optimize > 2
+      && selected_cpu->tune->max_case_values != 0)
+    return selected_cpu->tune->max_case_values;
+  else
+    return default_case_values_threshold ();
+}
+
 /* Return true if register REGNO is a valid index register.
    STRICT_P is true if REG_OK_STRICT is in effect.  */
 
@@ -7672,6 +7698,13 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Set the L1 cache line size.  */
+  if (selected_cpu->tune->cache_line_size != 0)
+    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			   selected_cpu->tune->cache_line_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13385,6 +13418,7 @@ aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13466,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-11-04 23:21     ` Evandro Menezes
@ 2015-11-05  9:22       ` James Greenhalgh
  2015-11-05 17:31         ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: James Greenhalgh @ 2015-11-05  9:22 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Wed, Nov 04, 2015 at 05:21:03PM -0600, Evandro Menezes wrote:
> Please, ignore the previous patch.  This is the intended patch.
> 
> Sorry.
> 
> -- 
> Evandro Menezes
> 
> On 11/04/2015 05:18 PM, Evandro Menezes wrote:
> >This patch adds extra tuning information about AArch64 targets:
> >
> > * Maximum number of case values before resorting to a jump table
> >   The default values assumed independently of the specific backends
> >   may be rather low for modern processors, which sport quite efficient
> >   direct branch prediction, whereas indirect branch prediction is
> >   still typically not so efficient.  This value may be specifically
> >   set for a processor or left at zero to use the default values.
> > * L1 cache line size
> >   The auto-prefetcher uses this information when emitting software
> >   prefetch insns.
> >
> >Please, commit if it's alright.
> >
> >Thank you,
> >
> 

Thanks for the patch,

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 81792bc..ecf4685 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -195,6 +195,9 @@ struct tune_params
>    int vec_reassoc_width;
>    int min_div_recip_mul_sf;
>    int min_div_recip_mul_df;
> +  int max_case_values; /* Case values threshold; or 0 for the default.  */

If we're using an int, how about -1 as the sentinel value? (Maybe someone
really likes jump tables!). Otherwise, make this an unsigned int?

> +
> +  int cache_line_size; /* Cache line size; or 0 for the default.  */

unsigned int?

The patch is otherwise OK, though it needs a ChangeLog.

Thanks,
James

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-11-05  9:22       ` James Greenhalgh
@ 2015-11-05 17:31         ` Evandro Menezes
  2015-11-12 14:47           ` James Greenhalgh
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-05 17:31 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 2567 bytes --]

James,

Since the other members of the "tune_params" structure are signed
integers, even though negative numbers would make no sense for most of
them either, I followed the same pattern.

Regardless, here's a patch with unsigned integers as you requested:

    [AArch64] Add extra tuning parameters for target processors

    2015-11-05  Evandro Menezes  <e.menezes@samsung.com>

    gcc/

        * config/aarch64/aarch64-protos.h (tune_params): Add new members
        "max_case_values" and "cache_line_size".
        * config/aarch64/aarch64.c (aarch64_case_values_threshold): New
        function.
        (aarch64_override_options_internal): Tune heuristics based on new
        members in "tune_params".
        (TARGET_CASE_VALUES_THRESHOLD): Define macro.
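
For illustration only, here is a minimal standalone sketch of how the two
new fields are meant to be consumed; the type, names and numbers below are
made up and are not the actual Exynos M1 values:

/* Standalone sketch, not GCC's real declarations: just the shape of the
   two new knobs and the zero-means-default convention.  */
struct tune_sketch
{
  unsigned int max_case_values;	/* 0: fall back to the generic default.  */
  unsigned int cache_line_size;	/* 0: leave the L1 line size param alone.  */
};

/* Hypothetical numbers for a hypothetical core.  */
static const struct tune_sketch hypothetical_core_tune = { 48, 64 };

unsigned int
sketch_case_values_threshold (unsigned int generic_default)
{
  return hypothetical_core_tune.max_case_values != 0
	 ? hypothetical_core_tune.max_case_values
	 : generic_default;
}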

Please, commit if it's alright.

Thank you,

-- 
Evandro Menezes

On 11/05/2015 03:22 AM, James Greenhalgh wrote:
> On Wed, Nov 04, 2015 at 05:21:03PM -0600, Evandro Menezes wrote:
>> Please, ignore the previous patch.  This is the intended patch.
>>
>> Sorry.
>>
>> -- 
>> Evandro Menezes
>>
>> On 11/04/2015 05:18 PM, Evandro Menezes wrote:
>>> This patch adds extra tuning information about AArch64 targets:
>>>
>>> * Maximum number of case values before resorting to a jump table
>>>    The default values assumed independently of the specific backends
>>>    may be rather low for modern processors, which sport quite efficient
>>>    direct branch prediction, whereas indirect branch prediction is
>>>    still typically not so efficient.  This value may be specifically
>>>    set for a processor or left at zero to use the default values.
>>> * L1 cache line size
>>>    The auto-prefetcher uses this information when emitting software
>>>    prefetch insns.
>>>
>>> Please, commit if it's alright.
>>>
>>> Thank you,
>>>
> Thanks for the patch,
>
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index 81792bc..ecf4685 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -195,6 +195,9 @@ struct tune_params
>>     int vec_reassoc_width;
>>     int min_div_recip_mul_sf;
>>     int min_div_recip_mul_df;
>> +  int max_case_values; /* Case values threshold; or 0 for the default.  */
> If we're using an int, how about -1 as the sentinel value? (Maybe someone
> really likes jump tables!). Otherwise, make this an unsigned int?
>
>> +
>> +  int cache_line_size; /* Cache line size; or 0 for the default.  */
> unsigned int?
>
> The patch is otherwise OK, though it needs a ChangeLog.
>
> Thanks,
> James
>
>


[-- Attachment #2: target.patch --]
[-- Type: text/x-patch, Size: 4631 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 81792bc..ecf4685 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -195,6 +195,9 @@ struct tune_params
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
+  unsigned int max_case_values; /* Case values threshold; or 0 for the default.  */
+
+  unsigned int cache_line_size; /* Cache line size; or 0 for the default.  */
 
 /* An enum specifying how to take into account CPU autoprefetch capabilities
    during instruction scheduling:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5c8604f..e7f1c07 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -355,6 +355,8 @@ static const struct tune_params generic_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -378,6 +380,8 @@ static const struct tune_params cortexa53_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -401,6 +405,8 @@ static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
 };
@@ -424,6 +430,8 @@ static const struct tune_params cortexa72_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -446,6 +454,8 @@ static const struct tune_params thunderx_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -468,6 +478,8 @@ static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -3242,6 +3254,20 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
   return aarch64_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* Use the specified limit for the number of cases before using jump
+     tables at higher optimization levels.  */
+  if (optimize > 2
+      && selected_cpu->tune->max_case_values != 0)
+    return selected_cpu->tune->max_case_values;
+  else
+    return default_case_values_threshold ();
+}
+
 /* Return true if register REGNO is a valid index register.
    STRICT_P is true if REG_OK_STRICT is in effect.  */
 
@@ -7672,6 +7698,13 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Set the L1 cache line size.  */
+  if (selected_cpu->tune->cache_line_size != 0)
+    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			   selected_cpu->tune->cache_line_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13385,6 +13418,7 @@ aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13466,9 @@ aarch64_promoted_type (const_tree t)
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
  2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
@ 2015-11-05 20:51   ` Evandro Menezes
  2015-11-19 22:04     ` Evandro Menezes
  2015-11-05 23:30   ` [PATCH 3/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-05 20:51 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 588 bytes --]

    2015-11-05  Evandro Menezes <e.menezes@samsung.com>

    gcc/

        * config/aarch64/aarch64.c (aarch64_override_options_internal):
        Increase loop peeling limit.

This patch increases the limit on the number of insns in completely
peeled loops (max-completely-peeled-insns) to 400.  With this change, I
noticed no major regression in either Geekbench v3 or SPEC CPU2000, while
some benchmarks, typically FP ones, improved significantly.
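
To make the heuristic concrete, here is a minimal sketch of the kind of
loop the limit governs; the code is hypothetical, not from the benchmarks
above:

/* The trip count is known at compile time, so GCC considers peeling the
   loop completely, i.e. emitting all 16 iterations straight-line with no
   loop left.  Whether it does so depends on the estimated size of the
   result staying under the max-completely-peeled-insns limit.  */
void
saxpy16 (float *restrict y, const float *restrict x, float a)
{
  for (int i = 0; i < 16; i++)
    y[i] = a * x[i] + y[i];
}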

I tested this tuning on Exynos M1 and on A57.  ThunderX seems to benefit 
from this tuning too.  However, I'd appreciate comments from other 
stakeholders.

Thank you,

-- 
Evandro Menezes


[-- Attachment #2: peel.patch --]
[-- Type: text/x-patch, Size: 570 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5c8604f..66122e7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7672,6 +7672,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Increase the maximum peeling limit.  */
+  maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS,
+			 400,
+			 opts->x_param_values,
+			 global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3/4][AArch64] Add scheduling model for Exynos M1
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
  2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
  2015-11-05 20:51   ` [PATCH 2/4][AArch64] Increase the loop peeling limit Evandro Menezes
@ 2015-11-05 23:30   ` Evandro Menezes
  2015-11-09 23:06     ` Evandro Menezes
  2015-11-05 23:30   ` [PATCH 3/4][AArch64] " Evandro Menezes
  2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
  4 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-05 23:30 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 642 bytes --]

    2015-11-05  Evandro Menezes <e.menezes@samsung.com>

    gcc/
         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
         * config/aarch64/aarch64.md: Include "exynos-m1.md".
         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
         * config/arm/arm.md: Include "exynos-m1.md".
         * config/arm/arm-tune.md: Regenerated.
         * config/arm/exynos-m1.md: New file.

This patch adds the scheduling model for Exynos M1.  I split the DFA into
separate automata for the GP, LS and FP units, resulting in many fewer
states and arcs than before, since a single combined automaton would be
roughly the product of the per-unit state machines.

Please, commit if it's alright.

Thank you,

-- 
Evandro Menezes



[-- Attachment #2: 0001-AArch64-Add-scheduling-model-for-Exynos-M1.patch --]
[-- Type: text/x-patch, Size: 37148 bytes --]

From 6846ac01eb670a90b4e1335447ac3ecb60bbce77 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 5 Nov 2015 17:23:32 -0600
Subject: [PATCH] [AArch64] Add scheduling model for Exynos M1

2015-11-05  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
	* config/aarch64/aarch64.md: Include "exynos-m1.md".
	* config/arm/arm-cores.def: Use the Exynos M1 sched model.
	* config/arm/arm.md: Include "exynos-m1.md".
	* config/arm/exynos-m1.md: New file.
---
 gcc/config/aarch64/aarch64-cores.def |   2 +-
 gcc/config/aarch64/aarch64.md        |   1 +
 gcc/config/arm/arm-cores.def         |   2 +-
 gcc/config/arm/arm.md                |   3 +-
 gcc/config/arm/exynos-m1.md          | 974 +++++++++++++++++++++++++++++++++++
 5 files changed, 979 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/arm/exynos-m1.md

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..c17baa3 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6cdddf4..bd26f24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -202,6 +202,7 @@
 ;; Scheduling
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a57.md")
+(include "../arm/exynos-m1.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
 
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..3448e82 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 02e147e..e6f07e9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..a4f1138
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,974 @@
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_attr "exynos_m1_type"
+  "mla, mlal,
+   unknown"
+  (cond [
+	  (eq_attr "type" "mla, mlas, smlad, smladx,\
+			   smlawx, smlawy, smlaxy,\
+			   smlsd, smlsdx")
+	    (const_string "mla")
+
+	  (eq_attr "type" "smlal, smlals, smlald,\
+			   smlalxy, smlsld,\
+			   umaal, umlal, umlals")
+	    (const_string "mlal")]
+
+	  (const_string "unknown")))
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; Redefine this attribute for when building the AArch64 backend.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+(define_automaton "exynos_m1_gp")
+(define_automaton "exynos_m1_ls")
+(define_automaton "exynos_m1_fp")
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1_gp")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1_ls")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Common occurrences
+(define_reservation "em1_sfst" "(em1_fst + em1_st)")
+(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "(em1_alu)")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mul" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "mul"))
+  "em1_c")
+
+;; Integer multiply-accumulate
+;; TODO: tell 32 from 64-bit ones
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mla"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mlal"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation  "exynos_m1_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation  "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation  "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually varies between
+;; 3-5 and 1/4-1, but picked the median values.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2), em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu, em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "em1_lfst")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld, em1_ld)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2), (em1_nalu * 2))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4), (em1_nalu * 3))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4), (em1_nalu * 4))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "em1_sfst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_sfst * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_sfst * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_sfst * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "em1_sfst, em1_fst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_sfst * 3), (em1_fst * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 17
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_sfst * 4), (em1_fst * 2), em1_nalu)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_fcvt, em1_lfst)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st, em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla")
+(define_bypass 4 "exynos_m1_mla*" "exynos_m1_mlal")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3/4][AArch64] Add scheduling model for Exynos M1
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
                     ` (2 preceding siblings ...)
  2015-11-05 23:30   ` [PATCH 3/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
@ 2015-11-05 23:30   ` Evandro Menezes
  2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
  4 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-05 23:30 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

    2015-11-05  Evandro Menezes <e.menezes@samsung.com>

    gcc/
         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
         * config/aarch64/aarch64.md: Include "exynos-m1.md".
         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
         * config/arm/arm.md: Include "exynos-m1.md".
         * config/arm/arm-tune.md: Regenerated.
         * config/arm/exynos-m1.md: New file.

This patch adds the scheduling model for Exynos M1.  I split the DFA 
into ones for GP, LS and FP, resulting in many fewer states and arcs 
than before.
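
In case it helps review, here is a minimal sketch of the split, using the
automaton and unit names from the attached exynos-m1.md.  genautomata
builds one DFA per define_automaton, so the total number of states grows
roughly with the sum of the per-automaton state counts rather than their
product:

;; Three independent automata instead of one monolithic DFA.
(define_automaton "exynos_m1_gp")
(define_automaton "exynos_m1_ls")
(define_automaton "exynos_m1_fp")

;; The integer pipes live in the GP automaton...
(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
;; ...the load and store units in the LS automaton...
(define_cpu_unit "em1_lx" "exynos_m1_ls")
(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
;; ...and the two Neon/FP pipes in the FP automaton.
(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")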

Please, commit if it's alright.

Thank you,

-- 
Evandro Menezes


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 4/4][AArch64] Add cost model for Exynos M1
  2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
                     ` (3 preceding siblings ...)
  2015-11-05 23:30   ` [PATCH 3/4][AArch64] " Evandro Menezes
@ 2015-11-06  0:09   ` Evandro Menezes
  2015-11-19 22:06     ` Evandro Menezes
                       ` (2 more replies)
  4 siblings, 3 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-06  0:09 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 770 bytes --]

    2015-10-25  Evandro Menezes <e.menezes@samsung.com>

    gcc/

        * config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
        * config/aarch64/aarch64.c (exynosm1_addrcost_table): New variable.
        (exynosm1_regmove_cost): Likewise.
        (exynosm1_vector_cost): Likewise.
        (exynosm1_tunings): Likewise.
        * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
        * config/arm/arm.c (arm_exynos_m1_tune): Likewise.

This patch adds the cost model for Exynos M1.  Note that it depends on a
couple of previous patches:
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00505.html and
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00538.html

Please, commit if it's alright.

Thank you,

-- 
Evandro Menezes



[-- Attachment #2: 0001-AArch64-Add-cost-model-for-Exynos-M1.patch --]
[-- Type: text/x-patch, Size: 11124 bytes --]

From 9b02c57fd2f2507dcc79767d7ffdb7ccec4cdd25 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 5 Nov 2015 17:58:47 -0600
Subject: [PATCH] [AArch64] Add cost model for Exynos M1

2015-10-25  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
	* config/aarch64/aarch64.c (exynosm1_addrcost_table): New variable.
	(exynosm1_regmove_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(exynosm1_tunings): Likewise.
	* config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
	* config/arm/arm.c (arm_exynos_m1_tune): Likewise.
---
 gcc/config/aarch64/aarch64-cores.def |   2 +-
 gcc/config/aarch64/aarch64.c         |  66 ++++++++++++++++++++++
 gcc/config/arm/aarch-cost-tables.h   | 103 +++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm-cores.def         |   2 +-
 gcc/config/arm/arm.c                 |  23 ++++++++
 5 files changed, 194 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index c17baa3..607a333 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1,  "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e7f1c07..d7d3f05 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -215,6 +215,22 @@ static const struct cpu_addrcost_table cortexa57_addrcost_table =
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* register_offset  */
+  1, /* register_sextend  */
+  2, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
 static const struct cpu_addrcost_table xgene1_addrcost_table =
 {
     {
@@ -261,6 +277,16 @@ static const struct cpu_regmove_cost cortexa53_regmove_cost =
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost (actual, 4 and 9).  */
+  9, /* GP2FP  */
+  9, /* FP2GP  */
+  1 /* FP2FP  */
+};
+
 static const struct cpu_regmove_cost thunderx_regmove_cost =
 {
   2, /* GP2GP  */
@@ -313,6 +339,22 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   1 /* cond_not_taken_branch_cost  */
 };
 
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+  1, /* scalar_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* vec_stmt_cost  */
+  3, /* vec_to_scalar_cost  */
+  3, /* scalar_to_vec_cost  */
+  5, /* vec_align_load_cost  */
+  5, /* vec_unalign_load_cost  */
+  1, /* vec_unalign_store_cost  */
+  1, /* vec_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1 /* cond_not_taken_branch_cost  */
+};
+
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
@@ -436,6 +478,30 @@ static const struct tune_params cortexa72_tunings =
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
+static const struct tune_params exynosm1_tunings =
+{
+  &exynosm1_extra_costs,
+  &exynosm1_addrcost_table,
+  &exynosm1_regmove_cost,
+  &exynosm1_vector_cost,
+  &generic_branch_cost,
+  4,	/* memmov_cost  */
+  3,	/* issue_rate  */
+  (AARCH64_FUSE_NOTHING), /* fusible_ops  */
+  4,	/* function_align.  */
+  4,	/* jump_align.  */
+  4,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  48,	/* max_case_values.  */
+  64,	/* cache_line_size.  */
+  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+};
+
 static const struct tune_params thunderx_tunings =
 {
   &thunderx_extra_costs,
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 66e09a8..850bde0 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -331,6 +331,109 @@ const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table exynosm1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (0), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (21),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (0)  /* alu.  */
+  }
+};
+
 const struct cpu_cost_table xgene1_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 3448e82..18936f0 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 4310638..6d5a64e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1991,6 +1991,29 @@ const struct tune_params arm_cortex_a57_tune =
   tune_params::SCHED_AUTOPREF_FULL
 };
 
+const struct tune_params arm_exynos_m1_tune =
+{
+  arm_9e_rtx_costs,
+  &exynosm1_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  3,						/* Issue rate.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  tune_params::FUSE_NOTHING,
+  tune_params::SCHED_AUTOPREF_OFF
+};
+
 const struct tune_params arm_xgene1_tune =
 {
   arm_9e_rtx_costs,
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3/4][AArch64] Add scheduling model for Exynos M1
  2015-11-05 23:30   ` [PATCH 3/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
@ 2015-11-09 23:06     ` Evandro Menezes
  2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
  2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
  0 siblings, 2 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-09 23:06 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

I think it's better if I split this patch further into two, since one of
the changes is best done in aarch64.md.

Please, disregard this patch and watch for updates.

Thank you,

-- 
Evandro Menezes

On 11/05/2015 05:30 PM, Evandro Menezes wrote:
> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched 
> model.
>         * config/aarch64/aarch64.md: Include "exynos-m1.md".
>         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>         * config/arm/arm.md: Include "exynos-m1.md".
>         * config/arm/arm-tune.md: Regenerated.
>         * config/arm/exynos-m1.md: New file.
>
> This patch adds the scheduling model for Exynos M1.  I split the DFA 
> into ones for GP, LS and FP, resulting in many fewer states and arcs 
> than before.
>
> Please, commit if it's alright.
>
> Thank you,
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-09 23:06     ` Evandro Menezes
@ 2015-11-10 17:50       ` Evandro Menezes
  2015-11-10 18:01         ` Ramana Radhakrishnan
  2015-11-12 14:55         ` James Greenhalgh
  2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
  1 sibling, 2 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-10 17:50 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 407 bytes --]

    2015-11-10  Evandro Menezes <e.menezes@samsung.com>

    gcc/

        * config/aarch64/aarch64.md (predicated): Copy attribute from
        "arm.md".

This patch duplicates an attribute from arm.md so that the same pipeline 
model can be used for both AArch32 and AArch64.
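
To make the intent concrete, here is an abridged sketch of how the shared
model then uses it (the full reservations are in the exynos-m1.md attached
to patch 3b); since the AArch64 copy of the attribute is a constant "no",
only the non-predicated variant can ever match there:

;; Simple ALU without shift, non-predicated (type list abridged).
(define_insn_reservation "exynos_m1_alu" 1
  (and (eq_attr "tune" "exynosm1")
       (and (not (eq_attr "predicated" "yes"))
            (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm")))
  "em1_alu")

;; Simple ALU without shift, predicated (only reachable on AArch32).
(define_insn_reservation "exynos_m1_alu_p" 1
  (and (eq_attr "tune" "exynosm1")
       (and (eq_attr "predicated" "yes")
            (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm")))
  "em1_c")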

Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.

Please, commit if it's alright.

-- 
Evandro Menezes



[-- Attachment #2: 0001-AArch64-Add-attribute-for-compatibility-with-ARM-pip.patch --]
[-- Type: text/x-patch, Size: 1098 bytes --]

From 3b643a3c026350864713e1700dc44e4794d93809 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 9 Nov 2015 17:11:16 -0600
Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
 pipeline models

gcc/
	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
---
 gcc/config/aarch64/aarch64.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6b08850..2bc2ff5 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -195,6 +195,11 @@
 ;; 1 :=: yes
 (define_attr "far_branch" "" (const_int 0))
 
+;; [For compatibility with ARM in pipeline models]
+;; Attribute that specifies whether or not the instruction is executed
+;; conditionally (<C> != "AL"? "yes": "no").
+(define_attr "predicated" "yes,no" (const_string "no"))
+
 ;; -------------------------------------------------------------------
 ;; Pipeline descriptions and scheduling
 ;; -------------------------------------------------------------------
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-11-09 23:06     ` Evandro Menezes
  2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
@ 2015-11-10 17:54       ` Evandro Menezes
  2015-11-19 22:06         ` Evandro Menezes
  2015-11-20 17:17         ` James Greenhalgh
  1 sibling, 2 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-10 17:54 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 670 bytes --]

    2015-11-10  Evandro Menezes <e.menezes@samsung.com>

    gcc/

        * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
        * config/aarch64/aarch64.md: Include "exynos-m1.md".
        * config/arm/arm-cores.def: Use the Exynos M1 sched model.
        * config/arm/arm.md: Include "exynos-m1.md".
        * config/arm/arm-tune.md: Regenerated.
        * config/arm/exynos-m1.md: New file.

This patch adds the scheduling model for Exynos M1.  It depends on 
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
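
One detail worth pointing out in the attachment: besides the insn
reservations, the model describes forwarding paths with define_bypass,
which overrides the producer's latency for specific consumers.  For
example, this entry lets dependent multiply-accumulates issue back to
back, even though exynos_m1_mla is otherwise modeled with a 3-cycle
latency:

;; MLAs can feed other MLAs quickly.
(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla*")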

Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.

Please, commit if it's alright.

-- 
Evandro Menezes



[-- Attachment #2: 0002-AArch64-Add-scheduling-model-for-Exynos-M1.patch --]
[-- Type: text/x-patch, Size: 36382 bytes --]

From 0b7b6d597e5877c78c4d88e0d4491858555a5364 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 9 Nov 2015 17:18:52 -0600
Subject: [PATCH 2/2] [AArch64] Add scheduling model for Exynos M1

gcc/
	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
	* config/aarch64/aarch64.md: Include "exynos-m1.md".
	* config/arm/arm-cores.def: Use the Exynos M1 sched model.
	* config/arm/arm.md: Include "exynos-m1.md".
	* config/arm/arm-tune.md: Regenerated.
	* config/arm/exynos-m1.md: New file.
---
 gcc/config/aarch64/aarch64-cores.def |   2 +-
 gcc/config/aarch64/aarch64.md        |   1 +
 gcc/config/arm/arm-cores.def         |   2 +-
 gcc/config/arm/arm.md                |   3 +-
 gcc/config/arm/exynos-m1.md          | 947 +++++++++++++++++++++++++++++++++++
 5 files changed, 952 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/arm/exynos-m1.md

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..c17baa3 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 2bc2ff5..18f5547 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -210,6 +210,7 @@
 ;; Scheduling
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a57.md")
+(include "../arm/exynos-m1.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
 
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..3448e82 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 8ebb1bf..f14cd0e 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..fd73353
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,947 @@
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+(define_automaton "exynos_m1_gp")
+(define_automaton "exynos_m1_ls")
+(define_automaton "exynos_m1_fp")
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1_gp")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1_ls")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Common occurrences
+(define_reservation "em1_sfst" "(em1_fst + em1_st)")
+(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "(em1_alu)")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "mul32" "yes"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "mul64" "yes"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation  "exynos_m1_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation  "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation  "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually varies between
+;; 3-5 and 1/4-1, but picked the median values.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2), em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu, em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "em1_lfst")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld, em1_ld)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2), (em1_nalu * 2))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4), (em1_nalu * 3))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4), (em1_nalu * 4))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "em1_sfst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_sfst * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_sfst * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_sfst * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "em1_sfst, em1_fst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_sfst * 3), (em1_fst * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 17
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_sfst * 4), (em1_fst * 2), em1_nalu)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_fcvt, em1_lfst)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st, em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla*")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")
-- 
2.1.0.243.g30d45f7



* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
@ 2015-11-10 18:01         ` Ramana Radhakrishnan
  2015-11-10 18:03           ` Ramana Radhakrishnan
  2015-11-12 14:55         ` James Greenhalgh
  1 sibling, 1 reply; 52+ messages in thread
From: Ramana Radhakrishnan @ 2015-11-10 18:01 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: gcc-patches, Marcus Shawcroft, James Greenhalgh, Kyrill Tkachov,
	Andrew Pinski

On Tue, Nov 10, 2015 at 5:50 PM, Evandro Menezes <e.menezes@samsung.com> wrote:
>    2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64.md (predicated): Copy attribute from
>        "arm.md".
>
> This patch duplicates an attribute from arm.md so that the same pipeline
> model can be used for both AArch32 and AArch64.

I'm not an aarch64 maintainer so I cannot approve.

There are no predicated instructions in aarch64 - thus it's best imho
to have only one option, "no", and not even give someone the option to
accidentally set this to yes.
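
(Purely as an illustration of the suggestion, not part of the patch: a
single-valued attribute would look something like this.)

  ;; Hypothetical sketch: only one value, so it can never be set to "yes".
  (define_attr "predicated" "no" (const_string "no"))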

regards
Ramana


>
> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>
> Please, commit if it's alright.
>
> --
> Evandro Menezes
>
>


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-10 18:01         ` Ramana Radhakrishnan
@ 2015-11-10 18:03           ` Ramana Radhakrishnan
  0 siblings, 0 replies; 52+ messages in thread
From: Ramana Radhakrishnan @ 2015-11-10 18:03 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: gcc-patches, Marcus Shawcroft, James Greenhalgh, Kyrill Tkachov,
	Andrew Pinski

On Tue, Nov 10, 2015 at 6:01 PM, Ramana Radhakrishnan
<ramana.gcc@googlemail.com> wrote:
> On Tue, Nov 10, 2015 at 5:50 PM, Evandro Menezes <e.menezes@samsung.com> wrote:
>>    2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>
>>    gcc/
>>
>>        * config/aarch64/aarch64.md (predicated): Copy attribute from
>>        "arm.md".
>>
>> This patch duplicates an attribute from arm.md so that the same pipeline
>> model can be used for both AArch32 and AArch64.
>
> I'm not an aarch64 maintainer so I cannot approve.
>
> There are no predicated instructions in aarch64 - thus it's best imho
> to have only one option, "no" and not even give the option for someone
> to accidentally set this to yes.

Scratch that - I had a brain fade.

Ramana

>
> regards
> Ramana
>
>
>>
>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>
>> Please, commit if it's alright.
>>
>> --
>> Evandro Menezes
>>
>>


* Re: [PATCH 1/4][AArch64] Add scheduling and cost models for Exynos M1
  2015-11-05 17:31         ` Evandro Menezes
@ 2015-11-12 14:47           ` James Greenhalgh
  0 siblings, 0 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-11-12 14:47 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Thu, Nov 05, 2015 at 11:31:33AM -0600, Evandro Menezes wrote:
> James,
> 
> Since the other members of the "tune_params" structure were signed
> integers, even though negative values make no sense for most of them
> either, I followed the same pattern.
> 
> Regardless, here's a patch with unsigned integers as you requested:
> 
>    [AArch64] Add extra tuning parameters for target processors
> 
>    2015-11-05  Evandro Menezes  <e.menezes@samsung.com>
> 
>    gcc/
> 
>        * config/aarch64/aarch64-protos.h (tune_params): Add new members
>        "max_case_values" and "cache_line_size".
>        * config/aarch64/aarch64.c (aarch64_case_values_threshold): New
>        function.
>        (aarch64_override_options_internal): Tune heuristics based on new
>        members in "tune_params".
>        (TARGET_CASE_VALUES_THRESHOLD): Define macro.
> 
> Please, commit if it's alright.

Hi Evandro,

This is OK with a few nits.

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 81792bc..ecf4685 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -195,6 +195,9 @@ struct tune_params
>    int vec_reassoc_width;
>    int min_div_recip_mul_sf;
>    int min_div_recip_mul_df;
> +  unsigned int max_case_values; /* Case values threshold; or 0 for the default.  */
> +
> +  unsigned int cache_line_size; /* Cache line size; or 0 for the default.  */
>  
>  /* An enum specifying how to take into account CPU autoprefetch capabilities
>     during instruction scheduling:

I'd put the comments above the field, and make them slightly more
descriptive:

+  /* Value for aarch64_case_values_threshold; or 0 for the default.  */
+  unsigned int max_case_values;
+  /* Value for PARAM_L1_CACHE_LINE_SIZE; or 0 to use the default.  */
+  unsigned int cache_line_size;
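
As a rough sketch (names and details approximate, not the committed code),
these members would then be consumed along these lines in aarch64.c:

  /* Sketch: prefer the tuned case-values threshold when one is provided.  */
  static unsigned int
  aarch64_case_values_threshold (void)
  {
    if (optimize > 2 && aarch64_tune_params.max_case_values != 0)
      return aarch64_tune_params.max_case_values;
    return default_case_values_threshold ();
  }

  /* Sketch: in aarch64_override_options_internal, honor a tuned cache line
     size unless the user already set the parameter explicitly.  */
  if (aarch64_tune_params.cache_line_size != 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
                           aarch64_tune_params.cache_line_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);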

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5c8604f..e7f1c07 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13385,6 +13418,7 @@ aarch64_promoted_type (const_tree t)
>      return float_type_node;
>    return NULL_TREE;
>  }
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  

Drop this hunk.

I've applied the patch with those changes as revision 230261 on your behalf.

Thanks,
James


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
  2015-11-10 18:01         ` Ramana Radhakrishnan
@ 2015-11-12 14:55         ` James Greenhalgh
  2015-11-12 15:39           ` Evandro Menezes
  1 sibling, 1 reply; 52+ messages in thread
From: James Greenhalgh @ 2015-11-12 14:55 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Tue, Nov 10, 2015 at 11:50:12AM -0600, Evandro Menezes wrote:
>    2015-11-10  Evandro Menezes <e.menezes@samsung.com>
> 
>    gcc/
> 
>        * config/aarch64/aarch64.md (predicated): Copy attribute from
>        "arm.md".
> 
> This patch duplicates an attribute from arm.md so that the same
> pipeline model can be used for both AArch32 and AArch64.
> 
> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
> 
> Please, commit if it's alright.
> 
> -- 
> Evandro Menezes
> 
> 

> From 3b643a3c026350864713e1700dc44e4794d93809 Mon Sep 17 00:00:00 2001
> From: Evandro Menezes <e.menezes@samsung.com>
> Date: Mon, 9 Nov 2015 17:11:16 -0600
> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>  pipeline models
> 
> gcc/
> 	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
> ---
>  gcc/config/aarch64/aarch64.md | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6b08850..2bc2ff5 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -195,6 +195,11 @@
>  ;; 1 :=: yes
>  (define_attr "far_branch" "" (const_int 0))
>  
> +;; [For compatibility with ARM in pipeline models]
> +;; Attribute that specifies whether or not the instruction is executed
> +;; conditionally (<C> != "AL"? "yes": "no").

I'm not sure this <C> != "AL" [...] part makes sense to me (thinking only
of AArch64, I'd understand it on AArch32 :) ) and we should document that
this is never set for AArch64. Could you respin with a slightly clearer
comment.

Thanks,
James


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-12 14:55         ` James Greenhalgh
@ 2015-11-12 15:39           ` Evandro Menezes
  2015-11-12 17:32             ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-12 15:39 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/12/2015 08:55 AM, James Greenhalgh wrote:
> On Tue, Nov 10, 2015 at 11:50:12AM -0600, Evandro Menezes wrote:
>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>
>>     gcc/
>>
>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>         "arm.md".
>>
>> This patch duplicates an attribute from arm.md so that the same
>> pipeline model can be used for both AArch32 and AArch64.
>>
>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>
>> Please, commit if it's alright.
>>
>> -- 
>> Evandro Menezes
>>
>>
>>  From 3b643a3c026350864713e1700dc44e4794d93809 Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>>   pipeline models
>>
>> gcc/
>> 	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
>> ---
>>   gcc/config/aarch64/aarch64.md | 5 +++++
>>   1 file changed, 5 insertions(+)
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 6b08850..2bc2ff5 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -195,6 +195,11 @@
>>   ;; 1 :=: yes
>>   (define_attr "far_branch" "" (const_int 0))
>>   
>> +;; [For compatibility with ARM in pipeline models]
>> +;; Attribute that specifies whether or not the instruction is executed
>> +;; conditionally (<C> != "AL"? "yes": "no").
> I'm not sure this <C> != "AL" [...] part makes sense to me (thinking only
> of AArch64, I'd understand it on AArch32 :) ) and we should document that
> this is never set for AArch64. Could you respin with a slightly clearer
> comment.
Since this attribute was not described in config/arm/arm.md, I felt that
it needed to be, though perhaps in its original place instead.  I agree
that I should also point out that it's strictly for compatibility with
AArch32 and that it never matters for AArch64.

WRT the <C> thing, I was referring to the opcode fields terminology used 
by ARM in its ISA documentation.  Perhaps it's unnecessary, yes?

Thank you,

-- 
Evandro Menezes


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-12 15:39           ` Evandro Menezes
@ 2015-11-12 17:32             ` Evandro Menezes
  2015-11-19 22:05               ` Evandro Menezes
  2015-11-20 12:27               ` James Greenhalgh
  0 siblings, 2 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-12 17:32 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 2578 bytes --]

On 11/12/2015 09:39 AM, Evandro Menezes wrote:
> On 11/12/2015 08:55 AM, James Greenhalgh wrote:
>> On Tue, Nov 10, 2015 at 11:50:12AM -0600, Evandro Menezes wrote:
>>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>>
>>>     gcc/
>>>
>>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>>         "arm.md".
>>>
>>> This patch duplicates an attribute from arm.md so that the same
>>> pipeline model can be used for both AArch32 and AArch64.
>>>
>>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>>
>>> Please, commit if it's alright.
>>>
>>> -- 
>>> Evandro Menezes
>>>
>>>
>>>  From 3b643a3c026350864713e1700dc44e4794d93809 Mon Sep 17 00:00:00 2001
>>> From: Evandro Menezes <e.menezes@samsung.com>
>>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>>>   pipeline models
>>>
>>> gcc/
>>>     * config/aarch64/aarch64.md (predicated): Copy attribute from 
>>> "arm.md".
>>> ---
>>>   gcc/config/aarch64/aarch64.md | 5 +++++
>>>   1 file changed, 5 insertions(+)
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.md 
>>> b/gcc/config/aarch64/aarch64.md
>>> index 6b08850..2bc2ff5 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -195,6 +195,11 @@
>>>   ;; 1 :=: yes
>>>   (define_attr "far_branch" "" (const_int 0))
>>>   +;; [For compatibility with ARM in pipeline models]
>>> +;; Attribute that specifies whether or not the instruction is executed
>>> +;; conditionally (<C> != "AL"? "yes": "no").
>> I'm not sure this <C> != "AL" [...] part makes sense to me (thinking 
>> only
>> of AArch64, I'd understand it on AArch32 :) ) and we should document 
>> that
>> this is never set for AArch64. Could you respin with a slightly clearer
>> comment.
> Since this attribute was not described in config/arm/arm.md, I felt 
> that it needed to, but perhaps in its original place instead.  I agree 
> that I should also point out that it's strictly for compatibility with 
> AArch32 and that it never matters for AArch64.
>
> WRT the <C> thing, I was referring to the opcode fields terminology 
> used by ARM in its ISA documentation.  Perhaps it's unnecessary, yes?
>

    2015-11-12  Evandro Menezes <e.menezes@samsung.com>

    [AArch64] Add attribute for compatibility with ARM pipeline models

    gcc/

        * config/aarch64/aarch64.md (predicated): Copy attribute from
        "arm.md".
        * config/arm/arm.md (predicated): Added description.

Please, commit if it's alright.

-- 
Evandro Menezes


[-- Attachment #2: 0001-AArch64-Add-attribute-for-compatibility-with-ARM-pip.patch --]
[-- Type: text/x-patch, Size: 1645 bytes --]

From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 9 Nov 2015 17:11:16 -0600
Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
 pipeline models

gcc/
	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
	* config/arm/arm.md (predicated): Added description.
---
 gcc/config/aarch64/aarch64.md | 4 ++++
 gcc/config/arm/arm.md         | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1586256..d46f837 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -195,6 +195,10 @@
 ;; 1 :=: yes
 (define_attr "far_branch" "" (const_int 0))
 
+;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
+;; no predicated insns.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
 ;; -------------------------------------------------------------------
 ;; Pipeline descriptions and scheduling
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 73c3088..6bda491 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -105,6 +105,9 @@
 (define_attr "fpu" "none,vfp"
   (const (symbol_ref "arm_fpu_attr")))
 
+; Predicated means that the insn form is conditionally executed based on a
+; predicate.  We default to 'no' because no Thumb patterns match this rule
+; and not all ARM insns do.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
 ; LENGTH of an instruction (in bytes)
-- 
2.1.0.243.g30d45f7



* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-11-05 20:51   ` [PATCH 2/4][AArch64] Increase the loop peeling limit Evandro Menezes
@ 2015-11-19 22:04     ` Evandro Menezes
  2015-11-20 11:53       ` James Greenhalgh
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-19 22:04 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/05/2015 02:51 PM, Evandro Menezes wrote:
> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64.c (aarch64_override_options_internal):
>        Increase loop peeling limit.
>
> This patch increases the limit for the number of peeled insns. With 
> this change, I noticed no major regression in either Geekbench v3 or 
> SPEC CPU2000 while some benchmarks, typically FP ones, improved 
> significantly.
>
> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to 
> benefit from this tuning too.  However, I'd appreciate comments from 
> other stakeholders.

Ping.

-- 
Evandro Menezes


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-12 17:32             ` Evandro Menezes
@ 2015-11-19 22:05               ` Evandro Menezes
  2015-11-20 12:27               ` James Greenhalgh
  1 sibling, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-19 22:05 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/12/2015 11:32 AM, Evandro Menezes wrote:
> On 11/12/2015 09:39 AM, Evandro Menezes wrote:
>> On 11/12/2015 08:55 AM, James Greenhalgh wrote:
>>> On Tue, Nov 10, 2015 at 11:50:12AM -0600, Evandro Menezes wrote:
>>>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>>>
>>>>     gcc/
>>>>
>>>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>>>         "arm.md".
>>>>
>>>> This patch duplicates an attribute from arm.md so that the same
>>>> pipeline model can be used for both AArch32 and AArch64.
>>>>
>>>> Bootstrapped on arm-unknown-linux-gnueabihf, 
>>>> aarch64-unknown-linux-gnu.
>>>>
>>>> Please, commit if it's alright.
>>>>
>>>> -- 
>>>> Evandro Menezes
>>>>
>>>>
>>>>  From 3b643a3c026350864713e1700dc44e4794d93809 Mon Sep 17 00:00:00 
>>>> 2001
>>>> From: Evandro Menezes <e.menezes@samsung.com>
>>>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>>>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with 
>>>> ARM
>>>>   pipeline models
>>>>
>>>> gcc/
>>>>     * config/aarch64/aarch64.md (predicated): Copy attribute from 
>>>> "arm.md".
>>>> ---
>>>>   gcc/config/aarch64/aarch64.md | 5 +++++
>>>>   1 file changed, 5 insertions(+)
>>>>
>>>> diff --git a/gcc/config/aarch64/aarch64.md 
>>>> b/gcc/config/aarch64/aarch64.md
>>>> index 6b08850..2bc2ff5 100644
>>>> --- a/gcc/config/aarch64/aarch64.md
>>>> +++ b/gcc/config/aarch64/aarch64.md
>>>> @@ -195,6 +195,11 @@
>>>>   ;; 1 :=: yes
>>>>   (define_attr "far_branch" "" (const_int 0))
>>>>   +;; [For compatibility with ARM in pipeline models]
>>>> +;; Attribute that specifies whether or not the instruction is 
>>>> executed
>>>> +;; conditionally (<C> != "AL"? "yes": "no").
>>> I'm not sure this <C> != "AL" [...] part makes sense to me (thinking 
>>> only
>>> of AArch64, I'd understand it on AArch32 :) ) and we should document 
>>> that
>>> this is never set for AArch64. Could you respin with a slightly clearer
>>> comment.
>> Since this attribute was not described in config/arm/arm.md, I felt 
>> that it needed to, but perhaps in its original place instead.  I 
>> agree that I should also point out that it's strictly for 
>> compatibility with AArch32 and that it never matters for AArch64.
>>
>> WRT the <C> thing, I was referring to the opcode fields terminology 
>> used by ARM in its ISA documentation.  Perhaps it's unnecessary, yes?
>>
>
>    2015-11-12  Evandro Menezes <e.menezes@samsung.com>
>
>    [AArch64] Add attribute for compatibility with ARM pipeline models
>
>    gcc/
>
>        * config/aarch64/aarch64.md (predicated): Copy attribute from
>        "arm.md".
>        * config/arm/arm.md (predicated): Added description.
>
> Please, commit if it's alright.

Ping.

-- 
Evandro Menezes


* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
@ 2015-11-19 22:06         ` Evandro Menezes
  2015-11-20 17:17         ` James Greenhalgh
  1 sibling, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-19 22:06 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/10/2015 11:54 AM, Evandro Menezes wrote:
> 2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>        * config/aarch64/aarch64.md: Include "exynos-m1.md".
>        * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>        * config/arm/arm.md: Include "exynos-m1.md".
>        * config/arm/arm-tune.md: Regenerated.
>        * config/arm/exynos-m1.md: New file.
>
> This patch adds the scheduling model for Exynos M1.  It depends on 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
>
> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>
> Please, commit if it's alright.

Ping

-- 
Evandro Menezes


* Re: [PATCH 4/4][AArch64] Add cost model for Exynos M1
  2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
@ 2015-11-19 22:06     ` Evandro Menezes
  2015-11-20 17:19       ` James Greenhalgh
  2015-11-24  9:56     ` Kyrill Tkachov
  2015-12-03 20:49     ` Evandro Menezes
  2 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-19 22:06 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/05/2015 06:09 PM, Evandro Menezes wrote:
> 2015-10-25  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
>        * config/aarch64/aarch64.c (exynosm1_addrcost_table): New 
> variable.
>        (exynosm1_regmove_cost): Likewise.
>        (exynosm1_vector_cost): Likewise.
>        (exynosm1_tunings): Likewise.
>        * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
>        * config/arm/arm.c (arm_exynos_m1_tune): Likewise.
>
> This patch adds the cost model for Exynos M1.  This patch depends on a 
> couple of previous patches though, 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00505.html and 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00538.html
>
> Please, commit if it's alright.

Ping.

-- 
Evandro Menezes


* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-11-19 22:04     ` Evandro Menezes
@ 2015-11-20 11:53       ` James Greenhalgh
  2015-12-03 21:07         ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: James Greenhalgh @ 2015-11-20 11:53 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, richard.earnshaw, ramana.radhakrishnan

On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
> >2015-11-05  Evandro Menezes <e.menezes@samsung.com>
> >
> >   gcc/
> >
> >       * config/aarch64/aarch64.c (aarch64_override_options_internal):
> >       Increase loop peeling limit.
> >
> >This patch increases the limit for the number of peeled insns.
> >With this change, I noticed no major regression in either
> >Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
> >ones, improved significantly.
> >
> >I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
> >benefit from this tuning too.  However, I'd appreciate comments
> >from other stakeholders.
> 
> Ping.

I'd like to leave this for a call from the port maintainers. I can see why
this leads to more opportunities for vectorization, but I'm concerned about
the wider impact on code size. Certainly I wouldn't expect this to be our
default at -O2 and below.

My gut feeling is that this doesn't really belong in the back-end (there are
presumably good reasons why the default for this parameter across GCC has
fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
like Marcus or Richard to make the call as to whether or not we take this
patch.

For now, I'd drop it from the series (it stands alone anyway).
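
For context, the change being debated amounts to a back-end override of a
generic parameter, roughly of this shape (a sketch, not the actual hunk;
the value 400 is illustrative):

  /* Sketch: raise the complete-peeling insn limit from the back end unless
     the user set the parameter explicitly.  */
  maybe_set_param_value (PARAM_MAX_COMPLETELY_PEELED_INSNS, 400,
                         opts->x_param_values,
                         global_options_set.x_param_values);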

Thanks,
James


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-12 17:32             ` Evandro Menezes
  2015-11-19 22:05               ` Evandro Menezes
@ 2015-11-20 12:27               ` James Greenhalgh
  2015-11-20 14:34                 ` Kyrill Tkachov
  2015-11-20 15:55                 ` Evandro Menezes
  1 sibling, 2 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-11-20 12:27 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Thu, Nov 12, 2015 at 11:32:36AM -0600, Evandro Menezes wrote:
> On 11/12/2015 09:39 AM, Evandro Menezes wrote:
>    2015-11-12  Evandro Menezes <e.menezes@samsung.com>
> 
>    [AArch64] Add attribute for compatibility with ARM pipeline models
> 
>    gcc/
> 
>        * config/aarch64/aarch64.md (predicated): Copy attribute from
>        "arm.md".
>        * config/arm/arm.md (predicated): Added description.
> 
> Please, commit if it's alright.

The AArch64 part of this is OK.

> From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
> From: Evandro Menezes <e.menezes@samsung.com>
> Date: Mon, 9 Nov 2015 17:11:16 -0600
> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>  pipeline models
> 
> gcc/
> 	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
> 	* config/arm/arm.md (predicated): Added description.
> ---
>  gcc/config/aarch64/aarch64.md | 4 ++++
>  gcc/config/arm/arm.md         | 3 +++
>  2 files changed, 7 insertions(+)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 1586256..d46f837 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -195,6 +195,10 @@
>  ;; 1 :=: yes
>  (define_attr "far_branch" "" (const_int 0))
>  
> +;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
> +;; no predicated insns.
> +(define_attr "predicated" "yes,no" (const_string "no"))
> +
>  ;; -------------------------------------------------------------------
>  ;; Pipeline descriptions and scheduling
>  ;; -------------------------------------------------------------------
> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> index 73c3088..6bda491 100644
> --- a/gcc/config/arm/arm.md
> +++ b/gcc/config/arm/arm.md
> @@ -105,6 +105,9 @@
>  (define_attr "fpu" "none,vfp"
>    (const (symbol_ref "arm_fpu_attr")))
>  
> +; Predicated means that the insn form is conditionally executed based on a
> +; predicate.  We default to 'no' because no Thumb patterns match this rule
> +; and not all ARM insns do.

s/is conditionally executed/can be conditionally executed/ in the first
sentence. Otherwise, this looks OK to me but I can't approve the ARM part,
so you'll need to wait for a review from someone who can.

Thanks,
James

>  (define_attr "predicated" "yes,no" (const_string "no"))
>  
>  ; LENGTH of an instruction (in bytes)
> -- 
> 2.1.0.243.g30d45f7
> 


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-20 12:27               ` James Greenhalgh
@ 2015-11-20 14:34                 ` Kyrill Tkachov
  2015-11-20 15:56                   ` Evandro Menezes
  2015-11-20 15:55                 ` Evandro Menezes
  1 sibling, 1 reply; 52+ messages in thread
From: Kyrill Tkachov @ 2015-11-20 14:34 UTC (permalink / raw)
  To: James Greenhalgh, Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft', Andrew Pinski


On 20/11/15 12:27, James Greenhalgh wrote:
> On Thu, Nov 12, 2015 at 11:32:36AM -0600, Evandro Menezes wrote:
>> On 11/12/2015 09:39 AM, Evandro Menezes wrote:
>>     2015-11-12  Evandro Menezes <e.menezes@samsung.com>
>>
>>     [AArch64] Add attribute for compatibility with ARM pipeline models
>>
>>     gcc/
>>
>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>         "arm.md".
>>         * config/arm/arm.md (predicated): Added description.

The arm part is ok too. It's just a comment.
In the ChangeLog entry for arm.md I'd say "Add description."

Thanks,
Kyrill

>> Please, commit if it's alright.
> The AArch64 part of this is OK.
>
>>  From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>>   pipeline models
>>
>> gcc/
>> 	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
>> 	* config/arm/arm.md (predicated): Added description.
>> ---
>>   gcc/config/aarch64/aarch64.md | 4 ++++
>>   gcc/config/arm/arm.md         | 3 +++
>>   2 files changed, 7 insertions(+)
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 1586256..d46f837 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -195,6 +195,10 @@
>>   ;; 1 :=: yes
>>   (define_attr "far_branch" "" (const_int 0))
>>   
>> +;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
>> +;; no predicated insns.
>> +(define_attr "predicated" "yes,no" (const_string "no"))
>> +
>>   ;; -------------------------------------------------------------------
>>   ;; Pipeline descriptions and scheduling
>>   ;; -------------------------------------------------------------------
>> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
>> index 73c3088..6bda491 100644
>> --- a/gcc/config/arm/arm.md
>> +++ b/gcc/config/arm/arm.md
>> @@ -105,6 +105,9 @@
>>   (define_attr "fpu" "none,vfp"
>>     (const (symbol_ref "arm_fpu_attr")))
>>   
>> +; Predicated means that the insn form is conditionally executed based on a
>> +; predicate.  We default to 'no' because no Thumb patterns match this rule
>> +; and not all ARM insns do.
> s/is conditionally executed/can be conditionally executed/ in the first
> sentence. Otherwise, this looks OK to me but I can't approve the ARM part,
> so you'll need to wait for a review from someone who can.
>
> Thanks,
> James
>
>>   (define_attr "predicated" "yes,no" (const_string "no"))
>>   
>>   ; LENGTH of an instruction (in bytes)
>> -- 
>> 2.1.0.243.g30d45f7
>>


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-20 12:27               ` James Greenhalgh
  2015-11-20 14:34                 ` Kyrill Tkachov
@ 2015-11-20 15:55                 ` Evandro Menezes
  2015-11-20 16:16                   ` James Greenhalgh
  1 sibling, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-11-20 15:55 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/20/2015 06:27 AM, James Greenhalgh wrote:
> On Thu, Nov 12, 2015 at 11:32:36AM -0600, Evandro Menezes wrote:
>> On 11/12/2015 09:39 AM, Evandro Menezes wrote:
>>     2015-11-12  Evandro Menezes <e.menezes@samsung.com>
>>
>>     [AArch64] Add attribute for compatibility with ARM pipeline models
>>
>>     gcc/
>>
>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>         "arm.md".
>>         * config/arm/arm.md (predicated): Added description.
>>
>> Please, commit if it's alright.
> The AArch64 part of this is OK.
>
>>  From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>>   pipeline models
>>
>> gcc/
>> 	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
>> 	* config/arm/arm.md (predicated): Added description.
>> ---
>>   gcc/config/aarch64/aarch64.md | 4 ++++
>>   gcc/config/arm/arm.md         | 3 +++
>>   2 files changed, 7 insertions(+)
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 1586256..d46f837 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -195,6 +195,10 @@
>>   ;; 1 :=: yes
>>   (define_attr "far_branch" "" (const_int 0))
>>   
>> +;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
>> +;; no predicated insns.
>> +(define_attr "predicated" "yes,no" (const_string "no"))
>> +
>>   ;; -------------------------------------------------------------------
>>   ;; Pipeline descriptions and scheduling
>>   ;; -------------------------------------------------------------------
>> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
>> index 73c3088..6bda491 100644
>> --- a/gcc/config/arm/arm.md
>> +++ b/gcc/config/arm/arm.md
>> @@ -105,6 +105,9 @@
>>   (define_attr "fpu" "none,vfp"
>>     (const (symbol_ref "arm_fpu_attr")))
>>   
>> +; Predicated means that the insn form is conditionally executed based on a
>> +; predicate.  We default to 'no' because no Thumb patterns match this rule
>> +; and not all ARM insns do.
> s/is conditionally executed/can be conditionally executed/ in the first
> sentence. Otherwise, this looks OK to me but I can't approve the ARM part,
> so you'll need to wait for a review from someone who can.
>
>>   (define_attr "predicated" "yes,no" (const_string "no"))
>>   
>>   ; LENGTH of an instruction (in bytes)
>> -- 
>> 2.1.0.243.g30d45f7

Actually, that would then be the existing description for the 
"predicable" attribute.  I think that this proposed description is 
correct for the "predicated" attribute.

Thank you,

-- 
Evandro Menezes


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-20 14:34                 ` Kyrill Tkachov
@ 2015-11-20 15:56                   ` Evandro Menezes
  0 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-20 15:56 UTC (permalink / raw)
  To: Kyrill Tkachov, James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft', Andrew Pinski

On 11/20/2015 08:34 AM, Kyrill Tkachov wrote:
>
> On 20/11/15 12:27, James Greenhalgh wrote:
>> On Thu, Nov 12, 2015 at 11:32:36AM -0600, Evandro Menezes wrote:
>>> On 11/12/2015 09:39 AM, Evandro Menezes wrote:
>>>     2015-11-12  Evandro Menezes <e.menezes@samsung.com>
>>>
>>>     [AArch64] Add attribute for compatibility with ARM pipeline models
>>>
>>>     gcc/
>>>
>>>         * config/aarch64/aarch64.md (predicated): Copy attribute from
>>>         "arm.md".
>>>         * config/arm/arm.md (predicated): Added description.
>
> The arm part is ok too. It's just a comment.
> In the ChangeLog entry for arm.md I'd say "Add description."
>
> Thanks,
> Kyrill
>
>>> Please, commit if it's alright.
>> The AArch64 part of this is OK.
>>
>>>  From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
>>> From: Evandro Menezes <e.menezes@samsung.com>
>>> Date: Mon, 9 Nov 2015 17:11:16 -0600
>>> Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
>>>   pipeline models
>>>
>>> gcc/
>>>     * config/aarch64/aarch64.md (predicated): Copy attribute from 
>>> "arm.md".
>>>     * config/arm/arm.md (predicated): Added description.
>>> ---
>>>   gcc/config/aarch64/aarch64.md | 4 ++++
>>>   gcc/config/arm/arm.md         | 3 +++
>>>   2 files changed, 7 insertions(+)
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.md 
>>> b/gcc/config/aarch64/aarch64.md
>>> index 1586256..d46f837 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -195,6 +195,10 @@
>>>   ;; 1 :=: yes
>>>   (define_attr "far_branch" "" (const_int 0))
>>>   +;; Strictly for compatibility with AArch32 in pipeline models, 
>>> since AArch64 has
>>> +;; no predicated insns.
>>> +(define_attr "predicated" "yes,no" (const_string "no"))
>>> +
>>>   ;; 
>>> -------------------------------------------------------------------
>>>   ;; Pipeline descriptions and scheduling
>>>   ;; 
>>> -------------------------------------------------------------------
>>> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
>>> index 73c3088..6bda491 100644
>>> --- a/gcc/config/arm/arm.md
>>> +++ b/gcc/config/arm/arm.md
>>> @@ -105,6 +105,9 @@
>>>   (define_attr "fpu" "none,vfp"
>>>     (const (symbol_ref "arm_fpu_attr")))
>>>   +; Predicated means that the insn form is conditionally executed 
>>> based on a
>>> +; predicate.  We default to 'no' because no Thumb patterns match 
>>> this rule
>>> +; and not all ARM insns do.
>> s/is conditionally executed/can be conditionally executed/ in the first
>> sentence. Otherwise, this looks OK to me but I can't approve the ARM 
>> part,
>> so you'll need to wait for a review from someone who can.
>>
>> Thanks,
>> James
>>
>>>   (define_attr "predicated" "yes,no" (const_string "no"))
>>>     ; LENGTH of an instruction (in bytes)
>>> -- 
>>> 2.1.0.243.g30d45f7

Can you please commit it with this change in the Changelog?

Thank you,

-- 
Evandro Menezes


* Re: [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models
  2015-11-20 15:55                 ` Evandro Menezes
@ 2015-11-20 16:16                   ` James Greenhalgh
  0 siblings, 0 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-11-20 16:16 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Fri, Nov 20, 2015 at 09:55:29AM -0600, Evandro Menezes wrote:
> On 11/20/2015 06:27 AM, James Greenhalgh wrote:
> >On Thu, Nov 12, 2015 at 11:32:36AM -0600, Evandro Menezes wrote:
> >>On 11/12/2015 09:39 AM, Evandro Menezes wrote:
> >>    2015-11-12  Evandro Menezes <e.menezes@samsung.com>
> >>
> >>    [AArch64] Add attribute for compatibility with ARM pipeline models
> >>
> >>    gcc/
> >>
> >>        * config/aarch64/aarch64.md (predicated): Copy attribute from
> >>        "arm.md".
> >>        * config/arm/arm.md (predicated): Added description.
> >>
> >>Please, commit if it's alright.
> >The AArch64 part of this is OK.
> >
> >> From 3fa6a2bca8f3d2992b4607cff0afcc2d9caa96f4 Mon Sep 17 00:00:00 2001
> >>From: Evandro Menezes <e.menezes@samsung.com>
> >>Date: Mon, 9 Nov 2015 17:11:16 -0600
> >>Subject: [PATCH 1/2] [AArch64] Add attribute for compatibility with ARM
> >>  pipeline models
> >>
> >>gcc/
> >>	* config/aarch64/aarch64.md (predicated): Copy attribute from "arm.md".
> >>	* config/arm/arm.md (predicated): Added description.
> >>---
> >>  gcc/config/aarch64/aarch64.md | 4 ++++
> >>  gcc/config/arm/arm.md         | 3 +++
> >>  2 files changed, 7 insertions(+)
> >>
> >>diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> >>index 1586256..d46f837 100644
> >>--- a/gcc/config/aarch64/aarch64.md
> >>+++ b/gcc/config/aarch64/aarch64.md
> >>@@ -195,6 +195,10 @@
> >>  ;; 1 :=: yes
> >>  (define_attr "far_branch" "" (const_int 0))
> >>+;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
> >>+;; no predicated insns.
> >>+(define_attr "predicated" "yes,no" (const_string "no"))
> >>+
> >>  ;; -------------------------------------------------------------------
> >>  ;; Pipeline descriptions and scheduling
> >>  ;; -------------------------------------------------------------------
> >>diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> >>index 73c3088..6bda491 100644
> >>--- a/gcc/config/arm/arm.md
> >>+++ b/gcc/config/arm/arm.md
> >>@@ -105,6 +105,9 @@
> >>  (define_attr "fpu" "none,vfp"
> >>    (const (symbol_ref "arm_fpu_attr")))
> >>+; Predicated means that the insn form is conditionally executed based on a
> >>+; predicate.  We default to 'no' because no Thumb patterns match this rule
> >>+; and not all ARM insns do.
> >s/is conditionally executed/can be conditionally executed/ in the first
> >sentence. Otherwise, this looks OK to me but I can't approve the ARM part,
> >so you'll need to wait for a review from someone who can.
> >
> >>  (define_attr "predicated" "yes,no" (const_string "no"))
> >>  ; LENGTH of an instruction (in bytes)
> >>-- 
> >>2.1.0.243.g30d45f7
> 
> Actually, that would then be the existing description for the
> "predicable" attribute.  I think that this proposed description is
> correct for the "predicated" attribute.

Right, understood, that will teach me to pattern match up to pred and stop
reading the word.

This is fine, I've committed it on your behalf as r230666

Thanks,
James


* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
  2015-11-19 22:06         ` Evandro Menezes
@ 2015-11-20 17:17         ` James Greenhalgh
  2015-11-20 22:07           ` Evandro Menezes
  2015-12-03 20:58           ` Evandro Menezes
  1 sibling, 2 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-11-20 17:17 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, Ramana Radhakrishnan

On Tue, Nov 10, 2015 at 11:54:00AM -0600, Evandro Menezes wrote:
>    2015-11-10  Evandro Menezes <e.menezes@samsung.com>
> 
>    gcc/
> 
>        * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>        * config/aarch64/aarch64.md: Include "exynos-m1.md".
>        * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>        * config/arm/arm.md: Include "exynos-m1.md".
>        * config/arm/arm-tune.md: Regenerated.
>        * config/arm/exynos-m1.md: New file.
> 
> This patch adds the scheduling model for Exynos M1.  It depends on
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
> 
> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
> 
> Please, commit if it's alright.


> From 0b7b6d597e5877c78c4d88e0d4491858555a5364 Mon Sep 17 00:00:00 2001
> From: Evandro Menezes <e.menezes@samsung.com>
> Date: Mon, 9 Nov 2015 17:18:52 -0600
> Subject: [PATCH 2/2] [AArch64] Add scheduling model for Exynos M1
> 
> gcc/
> 	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
> 	* config/aarch64/aarch64.md: Include "exynos-m1.md".

These changes are fine.

> 	* config/arm/arm-cores.def: Use the Exynos M1 sched model.
> 	* config/arm/arm.md: Include "exynos-m1.md".
> 	* config/arm/arm-tune.md: Regenerated.

These changes need an ack from an ARM reviewer.

> 	* config/arm/exynos-m1.md: New file.

I have a few comments on this model.

> +;; The Exynos M1 core is modeled as a triple issue pipeline that has
> +;; the following functional units.
> +
> +(define_automaton "exynos_m1_gp")
> +(define_automaton "exynos_m1_ls")
> +(define_automaton "exynos_m1_fp")
> +
> +;; 1.  Two pipelines for simple integer operations: A, B
> +;; 2.  One pipeline for simple or complex integer operations: C
> +
> +(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
> +
> +(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
> +(define_reservation "em1_c" "em1_xc")

Is this extra reservation useful?  Can we not just use em1_xc directly?
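
For instance, the "exynos_m1_crc" reservation would then reference the unit
directly (sketch only):

  (define_insn_reservation "exynos_m1_crc" 2
    (and (eq_attr "tune" "exynosm1")
         (eq_attr "type" "crc"))
    "em1_xc")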

> +;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
> +
> +(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
> +
> +(define_reservation "em1_fmac" "em1_f0")
> +(define_reservation "em1_fcvt" "em1_f0")
> +(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
> +(define_reservation "em1_nalu0" "em1_f0")
> +(define_reservation "em1_nalu1" "em1_f1")
> +(define_reservation "em1_nmisc" "em1_f0")
> +(define_reservation "em1_ncrypt" "em1_f0")
> +(define_reservation "em1_fadd" "em1_f1")
> +(define_reservation "em1_fvar" "em1_f1")
> +(define_reservation "em1_fst" "em1_f1")

Same comment here: does this not just obfuscate the interaction between
instruction classes in the description?  I'm not against doing it this way
if you prefer, but it would seem to reduce readability to me.  I think there
is also an argument that this increases readability, so it is your choice.

> +
> +;; 4.  One pipeline for branch operations: BX
> +
> +(define_cpu_unit "em1_bx" "exynos_m1_gp")
> +
> +(define_reservation "em1_br" "em1_bx")
> +

And again?

> +;; 5.  One AGU for loads: L
> +;;     One AGU for stores and one pipeline for stores: S, SD
> +
> +(define_cpu_unit "em1_lx" "exynos_m1_ls")
> +(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
> +
> +(define_reservation "em1_ld" "em1_lx")
> +(define_reservation "em1_st" "(em1_sx + em1_sd)")
> +
> +;; Common occurrences
> +(define_reservation "em1_sfst" "(em1_fst + em1_st)")
> +(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
> +
> +;; Branches
> +;;
> +;; No latency as there is no result
> +;; TODO: Unconditional branches use no units;
> +;; conditional branches add the BX unit;
> +;; indirect branches add the C unit.
> +(define_insn_reservation "exynos_m1_branch" 0
> +  (and (eq_attr "tune" "exynosm1")
> +       (eq_attr "type" "branch"))
> +  "em1_br")
> +
> +(define_insn_reservation "exynos_m1_call" 1
> +  (and (eq_attr "tune" "exynosm1")
> +       (eq_attr "type" "call"))
> +  "em1_alu")
> +
> +;; Basic ALU
> +;;
> +;; Simple ALU without shift, non-predicated
> +(define_insn_reservation "exynos_m1_alu" 1
> +  (and (eq_attr "tune" "exynosm1")
> +       (and (not (eq_attr "predicated" "yes"))

(and (eq_attr "predicated" "no")) ?

Likewise throughout the file? Again this is your choice.
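
A sketch of the suggested spelling; since the quoted hunk is truncated, the
type list and reservation string here are illustrative rather than the ones
from the patch:

  (define_insn_reservation "exynos_m1_alu" 1
    (and (eq_attr "tune" "exynosm1")
         (and (eq_attr "predicated" "no")
              (eq_attr "type" "alu_imm, alu_sreg")))
    "em1_alu")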

This is OK from the AArch64 side, let me know if you plan to change any
of the above, otherwise I'll commit it (or someone else can commit it)
after I see an OK from an ARM reviewer.

Thanks,
James


* Re: [PATCH 4/4][AArch64] Add cost model for Exynos M1
  2015-11-19 22:06     ` Evandro Menezes
@ 2015-11-20 17:19       ` James Greenhalgh
  0 siblings, 0 replies; 52+ messages in thread
From: James Greenhalgh @ 2015-11-20 17:19 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski

On Thu, Nov 19, 2015 at 04:06:17PM -0600, Evandro Menezes wrote:
> On 11/05/2015 06:09 PM, Evandro Menezes wrote:
> >2015-10-25  Evandro Menezes <e.menezes@samsung.com>
> >
> >   gcc/
> >
> >       * config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
> >       * config/aarch64/aarch64.c (exynosm1_addrcost_table): New
> >variable.
> >       (exynosm1_regmove_cost): Likewise.
> >       (exynosm1_vector_cost): Likewise.
> >       (exynosm1_tunings): Likewise.
> >       * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
> >       * config/arm/arm.c (arm_exynos_m1_tune): Likewise.
> >
> >This patch adds the cost model for Exynos M1.  This patch depends
> >on a couple of previous patches though,
> >https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00505.html and
> >https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00538.html
> >
> >Please, commit if it's alright.
> 
> Ping.

This is OK from an AArch64 perspective. Please wait for an OK from an
ARM reviewer.

Thanks,
James


* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-11-20 17:17         ` James Greenhalgh
@ 2015-11-20 22:07           ` Evandro Menezes
  2015-12-03 20:58           ` Evandro Menezes
  1 sibling, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-11-20 22:07 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, Ramana Radhakrishnan

On 11/20/2015 11:17 AM, James Greenhalgh wrote:
> On Tue, Nov 10, 2015 at 11:54:00AM -0600, Evandro Menezes wrote:
>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>
>>     gcc/
>>
>>         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>>         * config/aarch64/aarch64.md: Include "exynos-m1.md".
>>         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>>         * config/arm/arm.md: Include "exynos-m1.md".
>>         * config/arm/arm-tune.md: Regenerated.
>>         * config/arm/exynos-m1.md: New file.
>>
>> This patch adds the scheduling model for Exynos M1.  It depends on
>> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
>>
>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>
>> Please, commit if it's alright.
>
>>  From 0b7b6d597e5877c78c4d88e0d4491858555a5364 Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Mon, 9 Nov 2015 17:18:52 -0600
>> Subject: [PATCH 2/2] [AArch64] Add scheduling model for Exynos M1
>>
>> gcc/
>> 	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>> 	* config/aarch64/aarch64.md: Include "exynos-m1.md".
> These changes are fine.
>
>> 	* config/arm/arm-cores.def: Use the Exynos M1 sched model.
>> 	* config/arm/arm.md: Include "exynos-m1.md".
>> 	* config/arm/arm-tune.md: Regenerated.
> These changes need an ack from an ARM reviewer.
>
>> 	* config/arm/exynos-m1.md: New file.
> I have a few comments on this model.
>
>> +;; The Exynos M1 core is modeled as a triple issue pipeline that has
>> +;; the following functional units.
>> +
>> +(define_automaton "exynos_m1_gp")
>> +(define_automaton "exynos_m1_ls")
>> +(define_automaton "exynos_m1_fp")
>> +
>> +;; 1.  Two pipelines for simple integer operations: A, B
>> +;; 2.  One pipeline for simple or complex integer operations: C
>> +
>> +(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
>> +
>> +(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
>> +(define_reservation "em1_c" "em1_xc")
> Is this extra reservation useful, can we not just use em1_xc directly?
>
>> +;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
>> +
>> +(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
>> +
>> +(define_reservation "em1_fmac" "em1_f0")
>> +(define_reservation "em1_fcvt" "em1_f0")
>> +(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
>> +(define_reservation "em1_nalu0" "em1_f0")
>> +(define_reservation "em1_nalu1" "em1_f1")
>> +(define_reservation "em1_nmisc" "em1_f0")
>> +(define_reservation "em1_ncrypt" "em1_f0")
>> +(define_reservation "em1_fadd" "em1_f1")
>> +(define_reservation "em1_fvar" "em1_f1")
>> +(define_reservation "em1_fst" "em1_f1")
> Same comment here, does this not just obfuscate the interaction between
> instruction classes in the description. I'm not against doing it this way
> if you prefer, but it would seem to reduce readability to me. I think there
> is also an argument that this increases readability, so it is your choice.
>
>> +
>> +;; 4.  One pipeline for branch operations: BX
>> +
>> +(define_cpu_unit "em1_bx" "exynos_m1_gp")
>> +
>> +(define_reservation "em1_br" "em1_bx")
>> +
> And again?
>
>> +;; 5.  One AGU for loads: L
>> +;;     One AGU for stores and one pipeline for stores: S, SD
>> +
>> +(define_cpu_unit "em1_lx" "exynos_m1_ls")
>> +(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
>> +
>> +(define_reservation "em1_ld" "em1_lx")
>> +(define_reservation "em1_st" "(em1_sx + em1_sd)")
>> +
>> +;; Common occurrences
>> +(define_reservation "em1_sfst" "(em1_fst + em1_st)")
>> +(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
>> +
>> +;; Branches
>> +;;
>> +;; No latency as there is no result
>> +;; TODO: Unconditional branches use no units;
>> +;; conditional branches add the BX unit;
>> +;; indirect branches add the C unit.
>> +(define_insn_reservation "exynos_m1_branch" 0
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (eq_attr "type" "branch"))
>> +  "em1_br")
>> +
>> +(define_insn_reservation "exynos_m1_call" 1
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (eq_attr "type" "call"))
>> +  "em1_alu")
>> +
>> +;; Basic ALU
>> +;;
>> +;; Simple ALU without shift, non-predicated
>> +(define_insn_reservation "exynos_m1_alu" 1
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (and (not (eq_attr "predicated" "yes"))
> (and (eq_attr "predicated" "no")) ?
>
> Likewise throughout the file? Again this is your choice.
>
> This is OK from the AArch64 side, let me know if you plan to change any
> of the above, otherwise I'll commit it (or someone else can commit it)
> after I see an OK from an ARM reviewer.

The naming choices were made more for uniformity's sake and to conform 
with the processor documentation.

The bit about "not... yes" <=> "no" is a goof up. :-s

I'm waiting for Kyrill's patch adding FCCMP to be approved before amending
this patch.  However, I'd appreciate it if this patch could be approved and
committed before then, as soon as the ARM stuff is okayed.

Thank you,

-- 
Evandro Menezes


* Re: [PATCH 4/4][AArch64] Add cost model for Exynos M1
  2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
  2015-11-19 22:06     ` Evandro Menezes
@ 2015-11-24  9:56     ` Kyrill Tkachov
  2015-12-03 20:49     ` Evandro Menezes
  2 siblings, 0 replies; 52+ messages in thread
From: Kyrill Tkachov @ 2015-11-24  9:56 UTC (permalink / raw)
  To: Evandro Menezes, 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh', Andrew Pinski


On 06/11/15 00:09, Evandro Menezes wrote:
> 2015-10-25  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
>        * config/aarch64/aarch64.c (exynosm1_addrcost_table): New variable.
>        (exynosm1_regmove_cost): Likewise.
>        (exynosm1_vector_cost): Likewise.
>        (exynosm1_tunings): Likewise.
>        * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
>        * config/arm/arm.c (arm_exynos_m1_tune): Likewise.
>

This is ok arm-wise.
Thanks,
Kyrill

> This patch adds the cost model for Exynos M1.  This patch depends on a couple of previous patches though, https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00505.html and https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00538.html
>
> Please, commit if it's alright.
>
> Thank you,
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 4/4][AArch64] Add cost model for Exynos M1
  2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
  2015-11-19 22:06     ` Evandro Menezes
  2015-11-24  9:56     ` Kyrill Tkachov
@ 2015-12-03 20:49     ` Evandro Menezes
  2 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-12-03 20:49 UTC (permalink / raw)
  To: 'gcc-patches'
  Cc: 'Marcus Shawcroft', 'James Greenhalgh',
	'Kyrill Tkachov',
	Andrew Pinski

On 11/05/2015 06:09 PM, Evandro Menezes wrote:
> 2015-10-25  Evandro Menezes <e.menezes@samsung.com>
>
>    gcc/
>
>        * config/aarch64/aarch64-cores.def: Use the Exynos M1 cost model.
>        * config/aarch64/aarch64.c (exynosm1_addrcost_table): New 
> variable.
>        (exynosm1_regmove_cost): Likewise.
>        (exynosm1_vector_cost): Likewise.
>        (exynosm1_tunings): Likewise.
>        * config/arm/aarch-cost-tables.h (exynosm1_extra_costs): Likewise.
>        * config/arm/arm.c (arm_exynos_m1_tune): Likewise.
>
> This patch adds the cost model for Exynos M1.  This patch depends on a 
> couple of previous patches though, 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00505.html and 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00538.html

Checked in as r231233.

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-11-20 17:17         ` James Greenhalgh
  2015-11-20 22:07           ` Evandro Menezes
@ 2015-12-03 20:58           ` Evandro Menezes
  2015-12-04  9:25             ` Kyrill Tkachov
  1 sibling, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-12-03 20:58 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, Ramana Radhakrishnan

On 11/20/2015 11:17 AM, James Greenhalgh wrote:
> On Tue, Nov 10, 2015 at 11:54:00AM -0600, Evandro Menezes wrote:
>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>
>>     gcc/
>>
>>         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>>         * config/aarch64/aarch64.md: Include "exynos-m1.md".
>>         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>>         * config/arm/arm.md: Include "exynos-m1.md".
>>         * config/arm/arm-tune.md: Regenerated.
>>         * config/arm/exynos-m1.md: New file.
>>
>> This patch adds the scheduling model for Exynos M1.  It depends on
>> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
>>
>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>
>> Please, commit if it's alright.
>
>>  From 0b7b6d597e5877c78c4d88e0d4491858555a5364 Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Mon, 9 Nov 2015 17:18:52 -0600
>> Subject: [PATCH 2/2] [AArch64] Add scheduling model for Exynos M1
>>
>> gcc/
>> 	* config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>> 	* config/aarch64/aarch64.md: Include "exynos-m1.md".
> These changes are fine.
>
>> 	* config/arm/arm-cores.def: Use the Exynos M1 sched model.
>> 	* config/arm/arm.md: Include "exynos-m1.md".
>> 	* config/arm/arm-tune.md: Regenerated.
> These changes need an ack from an ARM reviewer.
>
>> 	* config/arm/exynos-m1.md: New file.
> I have a few comments on this model.
>
>> +;; The Exynos M1 core is modeled as a triple issue pipeline that has
>> +;; the following functional units.
>> +
>> +(define_automaton "exynos_m1_gp")
>> +(define_automaton "exynos_m1_ls")
>> +(define_automaton "exynos_m1_fp")
>> +
>> +;; 1.  Two pipelines for simple integer operations: A, B
>> +;; 2.  One pipeline for simple or complex integer operations: C
>> +
>> +(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
>> +
>> +(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
>> +(define_reservation "em1_c" "em1_xc")
> Is this extra reservation useful, can we not just use em1_xc directly?
>
>> +;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
>> +
>> +(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
>> +
>> +(define_reservation "em1_fmac" "em1_f0")
>> +(define_reservation "em1_fcvt" "em1_f0")
>> +(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
>> +(define_reservation "em1_nalu0" "em1_f0")
>> +(define_reservation "em1_nalu1" "em1_f1")
>> +(define_reservation "em1_nmisc" "em1_f0")
>> +(define_reservation "em1_ncrypt" "em1_f0")
>> +(define_reservation "em1_fadd" "em1_f1")
>> +(define_reservation "em1_fvar" "em1_f1")
>> +(define_reservation "em1_fst" "em1_f1")
> Same comment here, does this not just obfuscate the interaction between
> instruction classes in the description. I'm not against doing it this way
> if you prefer, but it would seem to reduce readability to me. I think there
> is also an argument that this increases readability, so it is your choice.
>
>> +
>> +;; 4.  One pipeline for branch operations: BX
>> +
>> +(define_cpu_unit "em1_bx" "exynos_m1_gp")
>> +
>> +(define_reservation "em1_br" "em1_bx")
>> +
> And again?
>
>> +;; 5.  One AGU for loads: L
>> +;;     One AGU for stores and one pipeline for stores: S, SD
>> +
>> +(define_cpu_unit "em1_lx" "exynos_m1_ls")
>> +(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
>> +
>> +(define_reservation "em1_ld" "em1_lx")
>> +(define_reservation "em1_st" "(em1_sx + em1_sd)")
>> +
>> +;; Common occurrences
>> +(define_reservation "em1_sfst" "(em1_fst + em1_st)")
>> +(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
>> +
>> +;; Branches
>> +;;
>> +;; No latency as there is no result
>> +;; TODO: Unconditional branches use no units;
>> +;; conditional branches add the BX unit;
>> +;; indirect branches add the C unit.
>> +(define_insn_reservation "exynos_m1_branch" 0
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (eq_attr "type" "branch"))
>> +  "em1_br")
>> +
>> +(define_insn_reservation "exynos_m1_call" 1
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (eq_attr "type" "call"))
>> +  "em1_alu")
>> +
>> +;; Basic ALU
>> +;;
>> +;; Simple ALU without shift, non-predicated
>> +(define_insn_reservation "exynos_m1_alu" 1
>> +  (and (eq_attr "tune" "exynosm1")
>> +       (and (not (eq_attr "predicated" "yes"))
> (and (eq_attr "predicated" "no")) ?
>
> Likewise throughout the file? Again this is your choice.
>
> This is OK from the AArch64 side, let me know if you plan to change any
> of the above, otherwise I'll commit it (or someone else can commit it)
> after I see an OK from an ARM reviewer.

ARM ping.

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-11-20 11:53       ` James Greenhalgh
@ 2015-12-03 21:07         ` Evandro Menezes
  2015-12-14 11:26           ` James Greenhalgh
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-12-03 21:07 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, richard.earnshaw, ramana.radhakrishnan

On 11/20/2015 05:53 AM, James Greenhalgh wrote:
> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>
>>>    gcc/
>>>
>>>        * config/aarch64/aarch64.c (aarch64_override_options_internal):
>>>        Increase loop peeling limit.
>>>
>>> This patch increases the limit for the number of peeled insns.
>>> With this change, I noticed no major regression in either
>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>> ones, improved significantly.
>>>
>>> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
>>> benefit from this tuning too.  However, I'd appreciate comments
>> from other stakeholders.
>>
>> Ping.
> I'd like to leave this for a call from the port maintainers. I can see why
> this leads to more opportunities for vectorization, but I'm concerned about
> the wider impact on code size. Certainly I wouldn't expect this to be our
> default at -O2 and below.
>
> My gut feeling is that this doesn't really belong in the back-end (there are
> presumably good reasons why the default for this parameter across GCC has
> fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
> like Marcus or Richard to make the call as to whether or not we take this
> patch.

Please correct me if I'm wrong, but loop peeling is enabled only with
loop unrolling (and with PGO).  If so, then the extra code size is not a
concern, for this heuristic is only active when unrolling loops, at
which point code size is already of secondary importance.

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-12-03 20:58           ` Evandro Menezes
@ 2015-12-04  9:25             ` Kyrill Tkachov
  2015-12-07 19:55               ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: Kyrill Tkachov @ 2015-12-04  9:25 UTC (permalink / raw)
  To: Evandro Menezes, James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	Andrew Pinski, Ramana Radhakrishnan

Hi Evandro,

On 03/12/15 20:58, Evandro Menezes wrote:
> On 11/20/2015 11:17 AM, James Greenhalgh wrote:
>> On Tue, Nov 10, 2015 at 11:54:00AM -0600, Evandro Menezes wrote:
>>>     2015-11-10  Evandro Menezes <e.menezes@samsung.com>
>>>
>>>     gcc/
>>>
>>>         * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>>>         * config/aarch64/aarch64.md: Include "exynos-m1.md".
>>>         * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>>>         * config/arm/arm.md: Include "exynos-m1.md".
>>>         * config/arm/arm-tune.md: Regenerated.
>>>         * config/arm/exynos-m1.md: New file.
>>>
>>> This patch adds the scheduling model for Exynos M1.  It depends on
>>> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01257.html
>>>
>>> Bootstrapped on arm-unknown-linux-gnueabihf, aarch64-unknown-linux-gnu.
>>>
>>> Please, commit if it's alright.
>>
>>>  From 0b7b6d597e5877c78c4d88e0d4491858555a5364 Mon Sep 17 00:00:00 2001
>>> From: Evandro Menezes <e.menezes@samsung.com>
>>> Date: Mon, 9 Nov 2015 17:18:52 -0600
>>> Subject: [PATCH 2/2] [AArch64] Add scheduling model for Exynos M1
>>>
>>> gcc/
>>>     * config/aarch64/aarch64-cores.def: Use the Exynos M1 sched model.
>>>     * config/aarch64/aarch64.md: Include "exynos-m1.md".
>> These changes are fine.
>>
>>>     * config/arm/arm-cores.def: Use the Exynos M1 sched model.
>>>     * config/arm/arm.md: Include "exynos-m1.md".
>>>     * config/arm/arm-tune.md: Regenerated.
>> These changes need an ack from an ARM reviewer.
>>
>>>     * config/arm/exynos-m1.md: New file.
>> I have a few comments on this model.
>>
>>> +;; The Exynos M1 core is modeled as a triple issue pipeline that has
>>> +;; the following functional units.
>>> +
>>> +(define_automaton "exynos_m1_gp")
>>> +(define_automaton "exynos_m1_ls")
>>> +(define_automaton "exynos_m1_fp")
>>> +
>>> +;; 1.  Two pipelines for simple integer operations: A, B
>>> +;; 2.  One pipeline for simple or complex integer operations: C
>>> +
>>> +(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
>>> +
>>> +(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
>>> +(define_reservation "em1_c" "em1_xc")
>> Is this extra reservation useful, can we not just use em1_xc directly?
>>
>>> +;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
>>> +
>>> +(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
>>> +
>>> +(define_reservation "em1_fmac" "em1_f0")
>>> +(define_reservation "em1_fcvt" "em1_f0")
>>> +(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
>>> +(define_reservation "em1_nalu0" "em1_f0")
>>> +(define_reservation "em1_nalu1" "em1_f1")
>>> +(define_reservation "em1_nmisc" "em1_f0")
>>> +(define_reservation "em1_ncrypt" "em1_f0")
>>> +(define_reservation "em1_fadd" "em1_f1")
>>> +(define_reservation "em1_fvar" "em1_f1")
>>> +(define_reservation "em1_fst" "em1_f1")
>> Same comment here, does this not just obfuscate the interaction between
>> instruction classes in the description. I'm not against doing it this way
>> if you prefer, but it would seem to reduce readability to me. I think there
>> is also an argument that this increases readability, so it is your choice.
>>
>>> +
>>> +;; 4.  One pipeline for branch operations: BX
>>> +
>>> +(define_cpu_unit "em1_bx" "exynos_m1_gp")
>>> +
>>> +(define_reservation "em1_br" "em1_bx")
>>> +
>> And again?
>>
>>> +;; 5.  One AGU for loads: L
>>> +;;     One AGU for stores and one pipeline for stores: S, SD
>>> +
>>> +(define_cpu_unit "em1_lx" "exynos_m1_ls")
>>> +(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
>>> +
>>> +(define_reservation "em1_ld" "em1_lx")
>>> +(define_reservation "em1_st" "(em1_sx + em1_sd)")
>>> +
>>> +;; Common occurrences
>>> +(define_reservation "em1_sfst" "(em1_fst + em1_st)")
>>> +(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
>>> +
>>> +;; Branches
>>> +;;
>>> +;; No latency as there is no result
>>> +;; TODO: Unconditional branches use no units;
>>> +;; conditional branches add the BX unit;
>>> +;; indirect branches add the C unit.
>>> +(define_insn_reservation "exynos_m1_branch" 0
>>> +  (and (eq_attr "tune" "exynosm1")
>>> +       (eq_attr "type" "branch"))
>>> +  "em1_br")
>>> +
>>> +(define_insn_reservation "exynos_m1_call" 1
>>> +  (and (eq_attr "tune" "exynosm1")
>>> +       (eq_attr "type" "call"))
>>> +  "em1_alu")
>>> +
>>> +;; Basic ALU
>>> +;;
>>> +;; Simple ALU without shift, non-predicated
>>> +(define_insn_reservation "exynos_m1_alu" 1
>>> +  (and (eq_attr "tune" "exynosm1")
>>> +       (and (not (eq_attr "predicated" "yes"))
>> (and (eq_attr "predicated" "no")) ?
>>
>> Likewise throughout the file? Again this is your choice.
>>
>> This is OK from the AArch64 side, let me know if you plan to change any
>> of the above, otherwise I'll commit it (or someone else can commit it)
>> after I see an OK from an ARM reviewer.
>
> ARM ping.
>

This is ok arm-wise, sorry for the delay.
Make sure to regenerate and commit the updated config/arm/arm-tune.md hunk
when committing the patch.

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1
  2015-12-04  9:25             ` Kyrill Tkachov
@ 2015-12-07 19:55               ` Evandro Menezes
  0 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2015-12-07 19:55 UTC (permalink / raw)
  To: Kyrill Tkachov, James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	Andrew Pinski, Ramana Radhakrishnan

On 12/04/2015 03:25 AM, Kyrill Tkachov wrote:
> This is ok arm-wise, sorry for the delay.
> Make sure to regenerate and commit the updated config/arm/arm-tune.md 
> hunk
> when committing the patch.

Checked in as r231378.

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-03 21:07         ` Evandro Menezes
@ 2015-12-14 11:26           ` James Greenhalgh
  2015-12-15 23:34             ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: James Greenhalgh @ 2015-12-14 11:26 UTC (permalink / raw)
  To: Evandro Menezes
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, richard.earnshaw, ramana.radhakrishnan

On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
> >On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
> >>On 11/05/2015 02:51 PM, Evandro Menezes wrote:
> >>>2015-11-05  Evandro Menezes <e.menezes@samsung.com>
> >>>
> >>>   gcc/
> >>>
> >>>       * config/aarch64/aarch64.c (aarch64_override_options_internal):
> >>>       Increase loop peeling limit.
> >>>
> >>>This patch increases the limit for the number of peeled insns.
> >>>With this change, I noticed no major regression in either
> >>>Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
> >>>ones, improved significantly.
> >>>
> >>>I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
> >>>benefit from this tuning too.  However, I'd appreciate comments
> >>>from other stakeholders.
> >>
> >>Ping.
> >I'd like to leave this for a call from the port maintainers. I can see why
> >this leads to more opportunities for vectorization, but I'm concerned about
> >the wider impact on code size. Certainly I wouldn't expect this to be our
> >default at -O2 and below.
> >
> >My gut feeling is that this doesn't really belong in the back-end (there are
> >presumably good reasons why the default for this parameter across GCC has
> >fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
> >like Marcus or Richard to make the call as to whether or not we take this
> >patch.
> 
> Please, correct me if I'm wrong, but loop peeling is enabled only
> with loop unrolling (and with PGO).  If so, then extra code size is
> not a concern, for this heuristic is only active when unrolling
> loops, when code size is already of secondary importance.

My understanding was that loop peeling is enabled from -O2 upwards, and
is also used to partially peel unaligned loops for vectorization (allowing
the vector code to be well aligned), or to completely peel inner loops which
may then become amenable to SLP vectorization.

If I'm wrong then I take back these objections. But I was sure this
parameter was used in a number of situations outside of just
-funroll-loops/-funroll-all-loops. Certainly I remember seeing
sensitivities to this parameter at -O3 in some internal workloads I was
analysing.
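
For illustration, this is the kind of loop nest I have in mind (a
made-up example): at -O3 the inner loop has a known trip count and can
be completely peeled, subject to this parameter, and the resulting
straight-line code may then become a candidate for SLP vectorization.

/* Hypothetical example.  The inner loop trips exactly 4 times, so it
   can be completely peeled (bounded by max-completely-peeled-insns);
   the four independent additions may then be SLP-vectorized.  */
void
add4 (float *restrict out, const float *restrict a,
      const float *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = 0; j < 4; j++)
      out[4 * i + j] = a[4 * i + j] + b[4 * i + j];
}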

Thanks,
James

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-14 11:26           ` James Greenhalgh
@ 2015-12-15 23:34             ` Evandro Menezes
  2015-12-16 11:24               ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-12-15 23:34 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, richard.earnshaw, ramana.radhakrishnan

On 12/14/2015 05:26 AM, James Greenhalgh wrote:
> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>
>>>>>    gcc/
>>>>>
>>>>>        * config/aarch64/aarch64.c (aarch64_override_options_internal):
>>>>>        Increase loop peeling limit.
>>>>>
>>>>> This patch increases the limit for the number of peeled insns.
>>>>> With this change, I noticed no major regression in either
>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>> ones, improved significantly.
>>>>>
>>>>> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>> from other stakeholders.
>>>>
>>>> Ping.
>>> I'd like to leave this for a call from the port maintainers. I can see why
>>> this leads to more opportunities for vectorization, but I'm concerned about
>>> the wider impact on code size. Certainly I wouldn't expect this to be our
>>> default at -O2 and below.
>>>
>>> My gut feeling is that this doesn't really belong in the back-end (there are
>>> presumably good reasons why the default for this parameter across GCC has
>>> fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
>>> like Marcus or Richard to make the call as to whether or not we take this
>>> patch.
>> Please, correct me if I'm wrong, but loop peeling is enabled only
>> with loop unrolling (and with PGO).  If so, then extra code size is
>> not a concern, for this heuristic is only active when unrolling
>> loops, when code size is already of secondary importance.
> My understanding was that loop peeling is enabled from -O2 upwards, and
> is also used to partially peel unaligned loops for vectorization (allowing
> the vector code to be well aligned), or to completely peel inner loops which
> may then become amenable to SLP vectorization.
>
> If I'm wrong then I take back these objections. But I was sure this
> parameter was used in a number of situations outside of just
> -funroll-loops/-funroll-all-loops . Certainly I remember seeing performance
> sensitivities to this parameter at -O3 in some internal workloads I was
> analysing.

Vectorization, including SLP, is only enabled at -O3, isn't it?  It
seems to me that peeling is only used by optimizations which already
lead to a potential increase in code size.

For instance, with "-Ofast -funroll-all-loops", the total text size for 
the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB 
without it; with just "-O2", it is the same at 23.1MB regardless of this 
setting.

So it seems to me that this proposal should be neutral at -O2 and below.

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-15 23:34             ` Evandro Menezes
@ 2015-12-16 11:24               ` Richard Earnshaw (lists)
  2015-12-16 12:42                 ` Richard Biener
  2015-12-16 20:11                 ` Evandro Menezes
  0 siblings, 2 replies; 52+ messages in thread
From: Richard Earnshaw (lists) @ 2015-12-16 11:24 UTC (permalink / raw)
  To: Evandro Menezes, James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, ramana.radhakrishnan

On 15/12/15 23:34, Evandro Menezes wrote:
> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>
>>>>>>    gcc/
>>>>>>
>>>>>>        * config/aarch64/aarch64.c
>>>>>> (aarch64_override_options_internal):
>>>>>>        Increase loop peeling limit.
>>>>>>
>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>> With this change, I noticed no major regression in either
>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>> ones, improved significantly.
>>>>>>
>>>>>> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>> from other stakeholders.
>>>>>
>>>>> Ping.
>>>> I'd like to leave this for a call from the port maintainers. I can
>>>> see why
>>>> this leads to more opportunities for vectorization, but I'm
>>>> concerned about
>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>> be our
>>>> default at -O2 and below.
>>>>
>>>> My gut feeling is that this doesn't really belong in the back-end
>>>> (there are
>>>> presumably good reasons why the default for this parameter across
>>>> GCC has
>>>> fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
>>>> like Marcus or Richard to make the call as to whether or not we take
>>>> this
>>>> patch.
>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>> not a concern, for this heuristic is only active when unrolling
>>> loops, when code size is already of secondary importance.
>> My understanding was that loop peeling is enabled from -O2 upwards, and
>> is also used to partially peel unaligned loops for vectorization
>> (allowing
>> the vector code to be well aligned), or to completely peel inner loops
>> which
>> may then become amenable to SLP vectorization.
>>
>> If I'm wrong then I take back these objections. But I was sure this
>> parameter was used in a number of situations outside of just
>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>> performance
>> sensitivities to this parameter at -O3 in some internal workloads I was
>> analysing.
> 
> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
> seems to me that peeling is only used by optimizations which already
> lead to potential increase in code size.
> 
> For instance, with "-Ofast -funroll-all-loops", the total text size for
> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
> without it; with just "-O2", it is the same at 23.1MB regardless of this
> setting.
> 
> So it seems to me that this proposal should be neutral for up to -O2.
> 
> Thank you,
> 

My preference would be to not diverge from the global parameter
settings.  I haven't looked in detail at this parameter but it seems to
me there are two possible paths:

1) We could get agreement globally that the parameter should be increased.
2) We could agree that this specific use of the parameter is distinct
from some other uses and deserves a new param in its own right with a
higher value.
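
For concreteness, option 2) would boil down to a new entry in
params.def along these lines (a sketch; the name, wording and default
value are purely illustrative):

/* Sketch only: a separate knob for complete peeling done on behalf of
   the vectorizer, distinct from max-completely-peeled-insns.  */
DEFPARAM (PARAM_MAX_VECT_COMPLETELY_PEELED_INSNS,
          "max-vect-completely-peeled-insns",
          "Maximum number of insns in a loop that is completely peeled "
          "on behalf of the vectorizer.",
          400, 0, 0)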

R.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-16 11:24               ` Richard Earnshaw (lists)
@ 2015-12-16 12:42                 ` Richard Biener
  2015-12-16 20:11                 ` Evandro Menezes
  1 sibling, 0 replies; 52+ messages in thread
From: Richard Biener @ 2015-12-16 12:42 UTC (permalink / raw)
  To: Richard Earnshaw (lists)
  Cc: Evandro Menezes, James Greenhalgh, gcc-patches, Marcus Shawcroft,
	Kyrill Tkachov, Andrew Pinski, Ramana Radhakrishnan

On Wed, Dec 16, 2015 at 12:24 PM, Richard Earnshaw (lists)
<Richard.Earnshaw@arm.com> wrote:
> On 15/12/15 23:34, Evandro Menezes wrote:
>> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>>
>>>>>>>    gcc/
>>>>>>>
>>>>>>>        * config/aarch64/aarch64.c
>>>>>>> (aarch64_override_options_internal):
>>>>>>>        Increase loop peeling limit.
>>>>>>>
>>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>>> With this change, I noticed no major regression in either
>>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>>> ones, improved significantly.
>>>>>>>
>>>>>>> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
>>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>>> from other stakeholders.
>>>>>>
>>>>>> Ping.
>>>>> I'd like to leave this for a call from the port maintainers. I can
>>>>> see why
>>>>> this leads to more opportunities for vectorization, but I'm
>>>>> concerned about
>>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>>> be our
>>>>> default at -O2 and below.
>>>>>
>>>>> My gut feeling is that this doesn't really belong in the back-end
>>>>> (there are
>>>>> presumably good reasons why the default for this parameter across
>>>>> GCC has
>>>>> fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
>>>>> like Marcus or Richard to make the call as to whether or not we take
>>>>> this
>>>>> patch.
>>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>>> not a concern, for this heuristic is only active when unrolling
>>>> loops, when code size is already of secondary importance.
>>> My understanding was that loop peeling is enabled from -O2 upwards, and
>>> is also used to partially peel unaligned loops for vectorization
>>> (allowing
>>> the vector code to be well aligned), or to completely peel inner loops
>>> which
>>> may then become amenable to SLP vectorization.
>>>
>>> If I'm wrong then I take back these objections. But I was sure this
>>> parameter was used in a number of situations outside of just
>>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>>> performance
>>> sensitivities to this parameter at -O3 in some internal workloads I was
>>> analysing.
>>
>> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
>> seems to me that peeling is only used by optimizations which already
>> lead to potential increase in code size.
>>
>> For instance, with "-Ofast -funroll-all-loops", the total text size for
>> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
>> without it; with just "-O2", it is the same at 23.1MB regardless of this
>> setting.
>>
>> So it seems to me that this proposal should be neutral for up to -O2.
>>
>> Thank you,
>>
>
> My preference would be to not diverge from the global parameter
> settings.  I haven't looked in detail at this parameter but it seems to
> me there are two possible paths:
>
> 1) We could get agreement globally that the parameter should be increased.
> 2) We could agree that this specific use of the parameter is distinct
> from some other uses and deserves a new param in its own right with a
> higher value.

I think the fix is to improve the unrolled size estimates by better taking into
account constant propagation and CSE opportunities.  I have some ideas
here but not sure if I have enough free cycles to implement this for GCC 7.
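
For instance (a made-up example), once the loop below is completely
unrolled, the index is a constant in every copy, the loads from the
constant table fold away and the arithmetic is constant-propagated and
CSE'd, so an estimate of plain body size times trip count grossly
overstates the real growth:

/* Hypothetical example: after complete unrolling, tbl[i] and (i + 1)
   are constants in each copy, the multiplications fold, and the whole
   function can collapse to returning a single constant.  */
static const int tbl[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };

int
sum_tbl (void)
{
  int s = 0;
  for (int i = 0; i < 8; i++)
    s += tbl[i] * (i + 1);
  return s;
}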

Richard.

> R.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-16 11:24               ` Richard Earnshaw (lists)
  2015-12-16 12:42                 ` Richard Biener
@ 2015-12-16 20:11                 ` Evandro Menezes
  2016-01-08 22:55                   ` Evandro Menezes
  1 sibling, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2015-12-16 20:11 UTC (permalink / raw)
  To: Richard Earnshaw (lists), James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, ramana.radhakrishnan, richard.guenther

On 12/16/2015 05:24 AM, Richard Earnshaw (lists) wrote:
> On 15/12/15 23:34, Evandro Menezes wrote:
>> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>>
>>>>>>>     gcc/
>>>>>>>
>>>>>>>         * config/aarch64/aarch64.c
>>>>>>> (aarch64_override_options_internal):
>>>>>>>         Increase loop peeling limit.
>>>>>>>
>>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>>> With this change, I noticed no major regression in either
>>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>>> ones, improved significantly.
>>>>>>>
>>>>>>> I tested this tuning on Exynos M1 and on A57.  ThunderX seems to
>>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>>> from other stakeholders.
>>>>>>
>>>>>> Ping.
>>>>> I'd like to leave this for a call from the port maintainers. I can
>>>>> see why
>>>>> this leads to more opportunities for vectorization, but I'm
>>>>> concerned about
>>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>>> be our
>>>>> default at -O2 and below.
>>>>>
>>>>> My gut feeling is that this doesn't really belong in the back-end
>>>>> (there are
>>>>> presumably good reasons why the default for this parameter across
>>>>> GCC has
>>>>> fluctuated from 400 to 100 to 200 over recent years), but as I say, I'd
>>>>> like Marcus or Richard to make the call as to whether or not we take
>>>>> this
>>>>> patch.
>>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>>> not a concern, for this heuristic is only active when unrolling
>>>> loops, when code size is already of secondary importance.
>>> My understanding was that loop peeling is enabled from -O2 upwards, and
>>> is also used to partially peel unaligned loops for vectorization
>>> (allowing
>>> the vector code to be well aligned), or to completely peel inner loops
>>> which
>>> may then become amenable to SLP vectorization.
>>>
>>> If I'm wrong then I take back these objections. But I was sure this
>>> parameter was used in a number of situations outside of just
>>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>>> performance
>>> sensitivities to this parameter at -O3 in some internal workloads I was
>>> analysing.
>> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
>> seems to me that peeling is only used by optimizations which already
>> lead to potential increase in code size.
>>
>> For instance, with "-Ofast -funroll-all-loops", the total text size for
>> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
>> without it; with just "-O2", it is the same at 23.1MB regardless of this
>> setting.
>>
>> So it seems to me that this proposal should be neutral for up to -O2.
>>
>> Thank you,
>>
> My preference would be to not diverge from the global parameter
> settings.  I haven't looked in detail at this parameter but it seems to
> me there are two possible paths:
>
> 1) We could get agreement globally that the parameter should be increased.
> 2) We could agree that this specific use of the parameter is distinct
> from some other uses and deserves a new param in its own right with a
> higher value.
>

Here's what I have observed, not only on AArch64: architectures benefit
differently from certain loop optimizations, especially those dealing
with vectorization, be it because some have plenty of registers for more
aggressive loop unrolling, or because some have lower costs to
vectorize.  With this, I'm trying to say that there may be a case for
adjusting this parameter to better suit loop optimizations to specific
targets.  While it is not the only parameter related to loop
optimizations, it seems to be the one with the desired effect, as
exemplified by PPC, S390 and x86 (AOSP).  Though it is possible that
these are actually side effects, as Richard Biener perhaps implied in
another reply.

Cheers,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2015-12-16 20:11                 ` Evandro Menezes
@ 2016-01-08 22:55                   ` Evandro Menezes
  2016-02-03 19:46                     ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2016-01-08 22:55 UTC (permalink / raw)
  To: Richard Earnshaw (lists), James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, ramana.radhakrishnan, richard.guenther

On 12/16/2015 02:11 PM, Evandro Menezes wrote:
> On 12/16/2015 05:24 AM, Richard Earnshaw (lists) wrote:
>> On 15/12/15 23:34, Evandro Menezes wrote:
>>> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>>>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>>>
>>>>>>>>     gcc/
>>>>>>>>
>>>>>>>>         * config/aarch64/aarch64.c
>>>>>>>> (aarch64_override_options_internal):
>>>>>>>>         Increase loop peeling limit.
>>>>>>>>
>>>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>>>> With this change, I noticed no major regression in either
>>>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>>>> ones, improved significantly.
>>>>>>>>
>>>>>>>> I tested this tuning on Exynos M1 and on A57. ThunderX seems to
>>>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>>>> from other stakeholders.
>>>>>>>
>>>>>>> Ping.
>>>>>> I'd like to leave this for a call from the port maintainers. I can
>>>>>> see why
>>>>>> this leads to more opportunities for vectorization, but I'm
>>>>>> concerned about
>>>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>>>> be our
>>>>>> default at -O2 and below.
>>>>>>
>>>>>> My gut feeling is that this doesn't really belong in the back-end
>>>>>> (there are
>>>>>> presumably good reasons why the default for this parameter across
>>>>>> GCC has
>>>>>> fluctuated from 400 to 100 to 200 over recent years), but as I 
>>>>>> say, I'd
>>>>>> like Marcus or Richard to make the call as to whether or not we take
>>>>>> this
>>>>>> patch.
>>>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>>>> not a concern, for this heuristic is only active when unrolling
>>>>> loops, when code size is already of secondary importance.
>>>> My understanding was that loop peeling is enabled from -O2 upwards, 
>>>> and
>>>> is also used to partially peel unaligned loops for vectorization
>>>> (allowing
>>>> the vector code to be well aligned), or to completely peel inner loops
>>>> which
>>>> may then become amenable to SLP vectorization.
>>>>
>>>> If I'm wrong then I take back these objections. But I was sure this
>>>> parameter was used in a number of situations outside of just
>>>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>>>> performance
>>>> sensitivities to this parameter at -O3 in some internal workloads I 
>>>> was
>>>> analysing.
>>> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
>>> seems to me that peeling is only used by optimizations which already
>>> lead to potential increase in code size.
>>>
>>> For instance, with "-Ofast -funroll-all-loops", the total text size for
>>> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
>>> without it; with just "-O2", it is the same at 23.1MB regardless of 
>>> this
>>> setting.
>>>
>>> So it seems to me that this proposal should be neutral for up to -O2.
>>>
>>> Thank you,
>>>
>> My preference would be to not diverge from the global parameter
>> settings.  I haven't looked in detail at this parameter but it seems to
>> me there are two possible paths:
>>
>> 1) We could get agreement globally that the parameter should be 
>> increased.
>> 2) We could agree that this specific use of the parameter is distinct
>> from some other uses and deserves a new param in its own right with a
>> higher value.
>>
>
> Here's what I have observed, not only in AArch64: architectures 
> benefit differently from certain loop optimizations, especially those 
> dealing with vectorization.  Be it because some have plenty of 
> registers of more aggressive loop unrolling, or because some have 
> lower costs to vectorize.  With this, I'm trying to imply that there 
> may be the case to wiggle this parameter to suit loop optimizations 
> better to specific targets.  While it is not the only parameter 
> related to loop optimizations, it seems to be the one with the desired 
> effects, as exemplified by PPC, S390 and x86 (AOSP).  Though there is 
> the possibility that they are actually side-effects, as Richard Biener 
> perhaps implied in another reply.
>


Gents,

Any new thoughts on this proposal?

Thank you,

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2016-01-08 22:55                   ` Evandro Menezes
@ 2016-02-03 19:46                     ` Evandro Menezes
  2016-03-16 19:48                       ` Evandro Menezes
  0 siblings, 1 reply; 52+ messages in thread
From: Evandro Menezes @ 2016-02-03 19:46 UTC (permalink / raw)
  To: Richard Earnshaw (lists), James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, ramana.radhakrishnan, richard.guenther

On 01/08/16 16:55, Evandro Menezes wrote:
> On 12/16/2015 02:11 PM, Evandro Menezes wrote:
>> On 12/16/2015 05:24 AM, Richard Earnshaw (lists) wrote:
>>> On 15/12/15 23:34, Evandro Menezes wrote:
>>>> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>>>>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>>>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>>>>
>>>>>>>>>     gcc/
>>>>>>>>>
>>>>>>>>>         * config/aarch64/aarch64.c
>>>>>>>>> (aarch64_override_options_internal):
>>>>>>>>>         Increase loop peeling limit.
>>>>>>>>>
>>>>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>>>>> With this change, I noticed no major regression in either
>>>>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>>>>> ones, improved significantly.
>>>>>>>>>
>>>>>>>>> I tested this tuning on Exynos M1 and on A57. ThunderX seems to
>>>>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>>>>> from other stakeholders.
>>>>>>>>
>>>>>>>> Ping.
>>>>>>> I'd like to leave this for a call from the port maintainers. I can
>>>>>>> see why
>>>>>>> this leads to more opportunities for vectorization, but I'm
>>>>>>> concerned about
>>>>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>>>>> be our
>>>>>>> default at -O2 and below.
>>>>>>>
>>>>>>> My gut feeling is that this doesn't really belong in the back-end
>>>>>>> (there are
>>>>>>> presumably good reasons why the default for this parameter across
>>>>>>> GCC has
>>>>>>> fluctuated from 400 to 100 to 200 over recent years), but as I 
>>>>>>> say, I'd
>>>>>>> like Marcus or Richard to make the call as to whether or not we 
>>>>>>> take
>>>>>>> this
>>>>>>> patch.
>>>>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>>>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>>>>> not a concern, for this heuristic is only active when unrolling
>>>>>> loops, when code size is already of secondary importance.
>>>>> My understanding was that loop peeling is enabled from -O2 
>>>>> upwards, and
>>>>> is also used to partially peel unaligned loops for vectorization
>>>>> (allowing
>>>>> the vector code to be well aligned), or to completely peel inner 
>>>>> loops
>>>>> which
>>>>> may then become amenable to SLP vectorization.
>>>>>
>>>>> If I'm wrong then I take back these objections. But I was sure this
>>>>> parameter was used in a number of situations outside of just
>>>>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>>>>> performance
>>>>> sensitivities to this parameter at -O3 in some internal workloads 
>>>>> I was
>>>>> analysing.
>>>> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
>>>> seems to me that peeling is only used by optimizations which already
>>>> lead to potential increase in code size.
>>>>
>>>> For instance, with "-Ofast -funroll-all-loops", the total text size 
>>>> for
>>>> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
>>>> without it; with just "-O2", it is the same at 23.1MB regardless of 
>>>> this
>>>> setting.
>>>>
>>>> So it seems to me that this proposal should be neutral for up to -O2.
>>>>
>>>> Thank you,
>>>>
>>> My preference would be to not diverge from the global parameter
>>> settings.  I haven't looked in detail at this parameter but it seems to
>>> me there are two possible paths:
>>>
>>> 1) We could get agreement globally that the parameter should be 
>>> increased.
>>> 2) We could agree that this specific use of the parameter is distinct
>>> from some other uses and deserves a new param in its own right with a
>>> higher value.
>>>
>>
>> Here's what I have observed, not only in AArch64: architectures 
>> benefit differently from certain loop optimizations, especially those 
>> dealing with vectorization.  Be it because some have plenty of 
>> registers of more aggressive loop unrolling, or because some have 
>> lower costs to vectorize.  With this, I'm trying to imply that there 
>> may be the case to wiggle this parameter to suit loop optimizations 
>> better to specific targets.  While it is not the only parameter 
>> related to loop optimizations, it seems to be the one with the 
>> desired effects, as exemplified by PPC, S390 and x86 (AOSP).  Though 
>> there is the possibility that they are actually side-effects, as 
>> Richard Biener perhaps implied in another reply.
>>
>
>
> Gents,
>
> Any new thoughts on this proposal?
>

Ping?

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 2/4][AArch64] Increase the loop peeling limit
  2016-02-03 19:46                     ` Evandro Menezes
@ 2016-03-16 19:48                       ` Evandro Menezes
  0 siblings, 0 replies; 52+ messages in thread
From: Evandro Menezes @ 2016-03-16 19:48 UTC (permalink / raw)
  To: Richard Earnshaw (lists), James Greenhalgh
  Cc: 'gcc-patches', 'Marcus Shawcroft',
	'Kyrill Tkachov',
	Andrew Pinski, ramana.radhakrishnan, richard.guenther

On 02/03/16 13:46, Evandro Menezes wrote:
> On 01/08/16 16:55, Evandro Menezes wrote:
>> On 12/16/2015 02:11 PM, Evandro Menezes wrote:
>>> On 12/16/2015 05:24 AM, Richard Earnshaw (lists) wrote:
>>>> On 15/12/15 23:34, Evandro Menezes wrote:
>>>>> On 12/14/2015 05:26 AM, James Greenhalgh wrote:
>>>>>> On Thu, Dec 03, 2015 at 03:07:43PM -0600, Evandro Menezes wrote:
>>>>>>> On 11/20/2015 05:53 AM, James Greenhalgh wrote:
>>>>>>>> On Thu, Nov 19, 2015 at 04:04:41PM -0600, Evandro Menezes wrote:
>>>>>>>>> On 11/05/2015 02:51 PM, Evandro Menezes wrote:
>>>>>>>>>> 2015-11-05  Evandro Menezes <e.menezes@samsung.com>
>>>>>>>>>>
>>>>>>>>>>     gcc/
>>>>>>>>>>
>>>>>>>>>>         * config/aarch64/aarch64.c
>>>>>>>>>> (aarch64_override_options_internal):
>>>>>>>>>>         Increase loop peeling limit.
>>>>>>>>>>
>>>>>>>>>> This patch increases the limit for the number of peeled insns.
>>>>>>>>>> With this change, I noticed no major regression in either
>>>>>>>>>> Geekbench v3 or SPEC CPU2000 while some benchmarks, typically FP
>>>>>>>>>> ones, improved significantly.
>>>>>>>>>>
>>>>>>>>>> I tested this tuning on Exynos M1 and on A57. ThunderX seems to
>>>>>>>>>> benefit from this tuning too.  However, I'd appreciate comments
>>>>>>>>>> from other stakeholders.
>>>>>>>>>
>>>>>>>>> Ping.
>>>>>>>>
>>>>>>>> I'd like to leave this for a call from the port maintainers. I can
>>>>>>>> see why
>>>>>>>> this leads to more opportunities for vectorization, but I'm
>>>>>>>> concerned about
>>>>>>>> the wider impact on code size. Certainly I wouldn't expect this to
>>>>>>>> be our
>>>>>>>> default at -O2 and below.
>>>>>>>>
>>>>>>>> My gut feeling is that this doesn't really belong in the back-end
>>>>>>>> (there are
>>>>>>>> presumably good reasons why the default for this parameter across
>>>>>>>> GCC has
>>>>>>>> fluctuated from 400 to 100 to 200 over recent years), but as I 
>>>>>>>> say, I'd
>>>>>>>> like Marcus or Richard to make the call as to whether or not we 
>>>>>>>> take
>>>>>>>> this
>>>>>>>> patch.
>>>>>>>
>>>>>>> Please, correct me if I'm wrong, but loop peeling is enabled only
>>>>>>> with loop unrolling (and with PGO).  If so, then extra code size is
>>>>>>> not a concern, for this heuristic is only active when unrolling
>>>>>>> loops, when code size is already of secondary importance.
>>>>>>
>>>>>> My understanding was that loop peeling is enabled from -O2 
>>>>>> upwards, and
>>>>>> is also used to partially peel unaligned loops for vectorization
>>>>>> (allowing
>>>>>> the vector code to be well aligned), or to completely peel inner 
>>>>>> loops
>>>>>> which
>>>>>> may then become amenable to SLP vectorization.
>>>>>>
>>>>>> If I'm wrong then I take back these objections. But I was sure this
>>>>>> parameter was used in a number of situations outside of just
>>>>>> -funroll-loops/-funroll-all-loops . Certainly I remember seeing
>>>>>> performance
>>>>>> sensitivities to this parameter at -O3 in some internal workloads 
>>>>>> I was
>>>>>> analysing.
>>>>>
>>>>> Vectorization, including SLP, is only enabled at -O3, isn't it?  It
>>>>> seems to me that peeling is only used by optimizations which already
>>>>> lead to potential increase in code size.
>>>>>
>>>>> For instance, with "-Ofast -funroll-all-loops", the total text 
>>>>> size for
>>>>> the SPEC CPU2000 suite is 26.9MB with this proposed change and 26.8MB
>>>>> without it; with just "-O2", it is the same at 23.1MB regardless 
>>>>> of this
>>>>> setting.
>>>>>
>>>>> So it seems to me that this proposal should be neutral for up to -O2.
>>>>
>>>> My preference would be to not diverge from the global parameter
>>>> settings.  I haven't looked in detail at this parameter but it 
>>>> seems to
>>>> me there are two possible paths:
>>>>
>>>> 1) We could get agreement globally that the parameter should be 
>>>> increased.
>>>> 2) We could agree that this specific use of the parameter is distinct
>>>> from some other uses and deserves a new param in its own right with a
>>>> higher value.
>>>
>>> Here's what I have observed, not only in AArch64: architectures 
>>> benefit differently from certain loop optimizations, especially 
>>> those dealing with vectorization. Be it because some have plenty of 
>>> registers of more aggressive loop unrolling, or because some have 
>>> lower costs to vectorize.  With this, I'm trying to imply that there 
>>> may be the case to wiggle this parameter to suit loop optimizations 
>>> better to specific targets.  While it is not the only parameter 
>>> related to loop optimizations, it seems to be the one with the 
>>> desired effects, as exemplified by PPC, S390 and x86 (AOSP).  Though 
>>> there is the possibility that they are actually side-effects, as 
>>> Richard Biener perhaps implied in another reply.
>>
>> Gents,
>>
>> Any new thoughts on this proposal?
>
> Ping?

Ping^2

-- 
Evandro Menezes

^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread

Thread overview: 52+ messages
2015-10-27 23:38 [AArch64] Add scheduling and cost models for Exynos M1 Evandro Menezes
2015-10-28 10:40 ` James Greenhalgh
2015-10-28 10:45   ` Andrew Pinski
2015-10-28 21:58     ` Evandro Menezes
2015-10-29 23:02   ` Evandro Menezes
2015-10-28 11:01 ` Kyrill Tkachov
2015-10-29 19:38   ` Evandro Menezes
2015-11-04 23:10 ` [PATCH 0/4][AArch64] " Evandro Menezes
2015-11-04 23:18   ` [PATCH 1/4][AArch64] " Evandro Menezes
2015-11-04 23:21     ` Evandro Menezes
2015-11-05  9:22       ` James Greenhalgh
2015-11-05 17:31         ` Evandro Menezes
2015-11-12 14:47           ` James Greenhalgh
2015-11-05 20:51   ` [PATCH 2/4][AArch64] Increase the loop peeling limit Evandro Menezes
2015-11-19 22:04     ` Evandro Menezes
2015-11-20 11:53       ` James Greenhalgh
2015-12-03 21:07         ` Evandro Menezes
2015-12-14 11:26           ` James Greenhalgh
2015-12-15 23:34             ` Evandro Menezes
2015-12-16 11:24               ` Richard Earnshaw (lists)
2015-12-16 12:42                 ` Richard Biener
2015-12-16 20:11                 ` Evandro Menezes
2016-01-08 22:55                   ` Evandro Menezes
2016-02-03 19:46                     ` Evandro Menezes
2016-03-16 19:48                       ` Evandro Menezes
2015-11-05 23:30   ` [PATCH 3/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
2015-11-09 23:06     ` Evandro Menezes
2015-11-10 17:50       ` [PATCH 3a/4][AArch64] Add attribute for compatibility with ARM pipeline models Evandro Menezes
2015-11-10 18:01         ` Ramana Radhakrishnan
2015-11-10 18:03           ` Ramana Radhakrishnan
2015-11-12 14:55         ` James Greenhalgh
2015-11-12 15:39           ` Evandro Menezes
2015-11-12 17:32             ` Evandro Menezes
2015-11-19 22:05               ` Evandro Menezes
2015-11-20 12:27               ` James Greenhalgh
2015-11-20 14:34                 ` Kyrill Tkachov
2015-11-20 15:56                   ` Evandro Menezes
2015-11-20 15:55                 ` Evandro Menezes
2015-11-20 16:16                   ` James Greenhalgh
2015-11-10 17:54       ` [PATCH 3b/4][AArch64] Add scheduling model for Exynos M1 Evandro Menezes
2015-11-19 22:06         ` Evandro Menezes
2015-11-20 17:17         ` James Greenhalgh
2015-11-20 22:07           ` Evandro Menezes
2015-12-03 20:58           ` Evandro Menezes
2015-12-04  9:25             ` Kyrill Tkachov
2015-12-07 19:55               ` Evandro Menezes
2015-11-05 23:30   ` [PATCH 3/4][AArch64] " Evandro Menezes
2015-11-06  0:09   ` [PATCH 4/4][AArch64] Add cost " Evandro Menezes
2015-11-19 22:06     ` Evandro Menezes
2015-11-20 17:19       ` James Greenhalgh
2015-11-24  9:56     ` Kyrill Tkachov
2015-12-03 20:49     ` Evandro Menezes
