[PATCH][4.6][ARM] New CPU support for Faraday cores

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH][4.6][ARM] New CPU support for Faraday cores
@ 2010-11-22  7:55 M.F. Wu
  2010-11-22 18:36 ` Joseph S. Myers
  0 siblings, 1 reply; 11+ messages in thread
From: M.F. Wu @ 2010-11-22  7:55 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 259 bytes --]

Hi,
This is the patch for gcc-4.6 with Faraday CPU support.
The supported CPUs are FA526/FA626/FA606TE/FA626TE/
FMP626/FA726TE. Tested with arm-none-eabi; no extra
regressions were found.
                                                               mingfeng

[-- Attachment #2: gcc-4.6-svn-20101116-faraday-cpu.patch --]
[-- Type: application/octet-stream, Size: 47364 bytes --]

--- tmp/gcc_svn/gcc/ChangeLog	2010-11-22 13:17:55.685455000 +0800
+++ gcc_svn/gcc/ChangeLog	2010-11-22 13:29:39.038672000 +0800
@@ -1,3 +1,23 @@
+2010-11-22  Toolchain  <toolchain@faraday-tech.com>
+
+	Add Faraday CPU support - FA526/FA626/FA606TE/FA626TE/FMP626/FA726TE.
+	Modify files:
+	* config/arm/arm-cores.def
+	* config/arm/arm-tune.md
+	* config/arm/arm.c
+	* config/arm/arm.md
+	* config/arm/bpabi.h
+	* config/arm/t-arm
+	* config/arm/t-arm-elf
+	* config/arm/t-linux-eabi
+	New added files:
+	* config/arm/fa526.md
+	* config/arm/fa626.md
+	* config/arm/fa606te.md
+	* config/arm/fa626te.md
+	* config/arm/fmp626.md
+	* config/arm/fa726te.md
+
 2010-11-13  Paolo Bonzini  <bonzini@gnu.org>
 
 	PR c/46462
diff -uNr tmp/gcc_svn/gcc/config/arm/arm-cores.def gcc_svn/gcc/config/arm/arm-cores.def
--- tmp/gcc_svn/gcc/config/arm/arm-cores.def	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/arm-cores.def	2010-11-17 10:06:02.662526000 +0800
@@ -74,6 +74,8 @@
 ARM_CORE("strongarm110",  strongarm110,	4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1100", strongarm1100, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1110", strongarm1110, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE("fa526",         fa526,        4,                               FL_LDSCHED, fastmul)
+ARM_CORE("fa626",         fa626,        4,                               FL_LDSCHED, fastmul)
 
 /* V4T Architecture Processors */
 ARM_CORE("arm7tdmi",      arm7tdmi,	4T,	FL_CO_PROC          , fastmul)
@@ -104,6 +106,10 @@
 ARM_CORE("xscale",        xscale,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale)
 ARM_CORE("iwmmxt",        iwmmxt,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
 ARM_CORE("iwmmxt2",       iwmmxt2,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
+ARM_CORE("fa606te",       fa606te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa626te",       fa626te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fmp626",        fmp626,       5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa726te",       fa726te,      5TE,                             FL_LDSCHED, fa726te)
 
 /* V5TEJ Architecture Processors */
 ARM_CORE("arm926ej-s",    arm926ejs,	5TEJ,	                         FL_LDSCHED, 9e)
diff -uNr tmp/gcc_svn/gcc/config/arm/arm-tune.md gcc_svn/gcc/config/arm/arm-tune.md
--- tmp/gcc_svn/gcc/config/arm/arm-tune.md	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/arm-tune.md	2010-11-17 10:06:02.665527000 +0800
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from arm-cores.def
 (define_attr "tune"
-	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
+	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
 	(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff -uNr tmp/gcc_svn/gcc/config/arm/arm.c gcc_svn/gcc/config/arm/arm.c
--- tmp/gcc_svn/gcc/config/arm/arm.c	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/arm.c	2010-11-17 10:06:02.747527000 +0800
@@ -128,6 +128,7 @@
 static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT);
 static int arm_comp_type_attributes (const_tree, const_tree);
 static void arm_set_default_type_attributes (tree);
+static int arm_sched_variable_issue (FILE *, int, rtx, int);
 static int arm_adjust_cost (rtx, rtx, rtx, int);
 static int count_insns_for_constant (HOST_WIDE_INT, int);
 static int arm_get_strip_length (int);
@@ -239,6 +240,7 @@
 static rtx arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
 static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -350,6 +352,9 @@
 #undef  TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
 #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes
 
+#undef  TARGET_SCHED_VARIABLE_ISSUE
+#define TARGET_SCHED_VARIABLE_ISSUE arm_sched_variable_issue
+
 #undef  TARGET_SCHED_ADJUST_COST
 #define TARGET_SCHED_ADJUST_COST arm_adjust_cost
 
@@ -862,6 +867,13 @@
   1
 };
 
+const struct tune_params arm_fa726te_tune =
+{
+  arm_9e_rtx_costs,  /* Reuses the 9e rtx cost routine.  */
+  fa726te_sched_adjust_cost,  /* Cost-adjustment hook defined in this file.  */
+  1
+};
+
 
 /* Not all of these give usefully different compilation alternatives,
    but there is no simple way of generalizing them.  */
@@ -7913,6 +7925,56 @@
   return true;
 }
 
+/* Scheduler cost adjustment for FA726TE.  INSN depends on DEP through
+   LINK; *COST is the latency to adjust.  Returns false after a new value
+   has been stored in *COST, and true when *COST was left untouched.  */
+static bool
+fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  bool uses_conds;
+
+  /* Only a true dependency between two recognizable insns, where the
+     producer sets the condition codes (CPSR), is adjusted here.  */
+  if (REG_NOTE_KIND (link) != REG_DEP_TRUE
+      || recog_memoized (insn) < 0
+      || recog_memoized (dep) < 0
+      || get_attr_conds (dep) != CONDS_SET)
+    return true;
+
+  uses_conds = get_attr_conds (insn) == CONDS_USE;
+
+  /* A non-branch consumer of the flags (e.g. carry use in 64-bit
+     arithmetic) gets a cost of 3.  */
+  if (uses_conds && get_attr_type (insn) != TYPE_BRANCH)
+    *cost = 3;
+  /* Remaining flag users (branches) and predicated insns get a cost of 0.  */
+  else if (uses_conds || GET_CODE (PATTERN (insn)) == COND_EXEC)
+    *cost = 0;
+  else
+    return true;
+  return false;
+}
+
+/* Implement TARGET_SCHED_VARIABLE_ISSUE: return how many insns can still be
+   issued this cycle after INSN, given MORE could be issued before it.  */
+static int
+arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED,
+                          int verbose ATTRIBUTE_UNUSED, rtx insn, int more)
+{
+  /* A naked USE or CLOBBER generates no instruction; as in the scheduler's
+     default handling, do not count it against the issue rate.  */
+  if (GET_CODE (PATTERN (insn)) == USE || GET_CODE (PATTERN (insn)) == CLOBBER)
+    return more;
+  /* FA726TE: recognizable ALU-class insns leave MORE unchanged; the patch
+     author added this as a fixup for some UNSPECs getting scheduled.  */
+  if (arm_tune == fa726te && recog_memoized (insn) >= 0
+      && (get_attr_type (insn) == TYPE_ALU
+          || get_attr_type (insn) == TYPE_ALU_SHIFT
+          || get_attr_type (insn) == TYPE_ALU_SHIFT_REG))
+    return more;
+  return more - 1;
+}
+
 /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
    It corrects the value of COST based on the relationship between
    INSN and DEP through the dependence LINK.  It returns the new
@@ -22722,6 +22784,7 @@
     case cortexa5:
     case cortexa8:
     case cortexa9:
+    case fa726te:
       return 2;
 
     default:
diff -uNr tmp/gcc_svn/gcc/config/arm/arm.md gcc_svn/gcc/config/arm/arm.md
--- tmp/gcc_svn/gcc/config/arm/arm.md	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/arm.md	2010-11-17 10:06:02.788490000 +0800
@@ -498,7 +498,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
+          (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -516,6 +516,11 @@
 (include "arm1020e.md")
 (include "arm1026ejs.md")
 (include "arm1136jfs.md")
+(include "fa526.md")
+(include "fa606te.md")
+(include "fa626te.md")
+(include "fmp626.md")
+(include "fa726te.md")
 (include "cortex-a5.md")
 (include "cortex-a8.md")
 (include "cortex-a9.md")
diff -uNr tmp/gcc_svn/gcc/config/arm/bpabi.h gcc_svn/gcc/config/arm/bpabi.h
--- tmp/gcc_svn/gcc/config/arm/bpabi.h	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/bpabi.h	2010-11-17 10:06:02.792483000 +0800
@@ -52,7 +52,7 @@
 /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
 
-#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa5*|mcpu=fa626:--fix-v4bx}"
 
 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
 
diff -uNr tmp/gcc_svn/gcc/config/arm/fa526.md gcc_svn/gcc/config/arm/fa526.md
--- tmp/gcc_svn/gcc/config/arm/fa526.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc_svn/gcc/config/arm/fa526.md	2010-11-17 10:06:02.797482000 +0800
@@ -0,0 +1,163 @@
+;; Faraday FA526 Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; These descriptions are based on the information contained in the
+;; FA526 Core Design Note, Copyright (c) 2006 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 3 (2 cycle penalty)
+;; ALU -> any use: latency = 2 (1 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA526 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa526_core" "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "526_alu_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu"))
+ "fa526_core")
+
+(define_insn_reservation "526_alu_shift_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa526_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "526_mult1" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "smlalxy,smulxy,smlaxy"))
+ "fa526_core")
+
+(define_insn_reservation "526_mult2" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
+                       umlals,smulls,smlals,smlawx"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "526_load1_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load1,load_byte"))
+ "fa526_core")
+
+(define_insn_reservation "526_load2_op" 4
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_load3_op" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_load4_op" 6
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load4"))
+ "fa526_core*4")
+
+(define_insn_reservation "526_store1_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store1"))
+ "fa526_core")
+
+(define_insn_reservation "526_store2_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_store3_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_store4_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store4"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "526_branch_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "branch"))
+ "fa526_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. For most cases, the return value is set 
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "526_call_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "call"))
+ "fa526_core")
+
diff -uNr tmp/gcc_svn/gcc/config/arm/fa606te.md gcc_svn/gcc/config/arm/fa606te.md
--- tmp/gcc_svn/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc_svn/gcc/config/arm/fa606te.md	2010-11-17 10:06:02.802475000 +0800
@@ -0,0 +1,173 @@
+;; Faraday FA606TE Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; These descriptions are based on the information contained in the
+;; FA606TE Core Design Note, Copyright (c) 2006 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 2 (1 cycle penalty)
+;; ALU -> any use: latency = 1 (0 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA606TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      E      M      W
+
+(define_cpu_unit "fa606te_core" "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "606te_alu_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "alu,alu_shift,alu_shift_reg"))
+ "fa606te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "606te_mult1" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlalxy"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_mult2" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_mult3" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "mul,mla,muls,mlas"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_mult4" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals"))
+ "fa606te_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "606te_load1_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_load2_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_load3_op" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_load4_op" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load4"))
+ "fa606te_core*4")
+
+(define_insn_reservation "606te_store1_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store1"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_store2_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_store3_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_store4_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store4"))
+ "fa606te_core*4")
+
+
+;;(define_insn_reservation "606te_ldm_op" 9
+;; (and (eq_attr "tune" "fa606te")
+;;      (eq_attr "type" "load2,load3,load4,store2,store3,store4"))
+;; "fa606te_core*7")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "606te_branch_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "branch"))
+ "fa606te_core")
+
+;; The latency for a call is actually the latency when the result is available,
+;; i.e. R0 ready for int return value. For most cases, the return value is set by a
+;; mov instruction, which has 1 cycle latency.
+(define_insn_reservation "606te_call_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "call"))
+ "fa606te_core")
+
diff -uNr tmp/gcc_svn/gcc/config/arm/fa626te.md gcc_svn/gcc/config/arm/fa626te.md
--- tmp/gcc_svn/gcc/config/arm/fa626te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc_svn/gcc/config/arm/fa626te.md	2010-11-17 10:06:02.807470000 +0800
@@ -0,0 +1,167 @@
+;; Faraday FA626TE Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; These descriptions are based on the information contained in the
+;; FA626TE Core Design Note, Copyright (c) 2006 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; ALU -> simple address LDR/STR: latency=2 (available after 2 cycles)
+;; ALU -> shifted address LDR/STR: latency=3
+;;		( extra 1 cycle unavoidable stall)
+;; ALU -> other use: latency=2 (available after 2 cycles)
+;; LD  -> simple address LDR/STR: latency=3 (available after 3 cycles)
+;; LD  -> shifted address LDR/STR: latency=4 
+;;		( extra 1 cycle unavoidable stall)
+;; LD  -> any other use: latency = 3 (available after 3 cycles)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA626TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa626te_core" "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "626te_alu_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_alu_shift_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa626te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "626te_mult1" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult2" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "mul,mla"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult3" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_mult4" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "626te_load1_op" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_load2_op" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load2,load3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_load3_op" 5
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load4"))
+ "fa626te_core*3")
+
+(define_insn_reservation "626te_store1_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store1"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_store2_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store2,store3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_store3_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store4"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "626te_branch_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "branch"))
+ "fa626te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. 
+(define_insn_reservation "626te_call_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "call"))
+ "fa626te_core")
+
diff -uNr tmp/gcc_svn/gcc/config/arm/fa726te.md gcc_svn/gcc/config/arm/fa726te.md
--- tmp/gcc_svn/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc_svn/gcc/config/arm/fa726te.md	2010-11-17 10:06:02.813461000 +0800
@@ -0,0 +1,222 @@
+;; Faraday FA726TE Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.  */
+
+;; These descriptions are based on the information contained in the
+;; FA726TE Core Design Note, Copyright (c) 2006 Faraday Technology Corp.
+;;
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA726TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa726te")
+(automata_option "ndfa")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;	E1	E2	E3	E4	E5	WB
+;;______________________________________________________
+;;
+;;      <-------------- LD/ST ----------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
+;;______________________________________________________
+;;
+;;      <---------- MUL --------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
+
+
+(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
+(define_cpu_unit "fa726te_mac_pipe" "fa726te")
+(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
+;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
+;; improve code quality
+(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
+(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
+
+(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
+;; reservation which blocks IS
+(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; Move instructions.
+(define_insn_reservation "726te_shift_op" 1
+  (and (eq_attr "tune" "fa726te")
+       (eq_attr "insn" "mov,mvn"))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with no shifted operand will finish in 1 cycle.
+;; Other ALU instructions take 2 cycles.
+(define_insn_reservation "726te_alu_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the execute stage.
+;; If shift+LU, it takes 2 cycles. If shift+AU, it takes 3 cycles.
+(define_insn_reservation "726te_alu_shift_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+(define_insn_reservation "726te_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift_reg")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times. Multiply operations occur in both the execute and memory
+;; stages of the pipeline
+
+(define_insn_reservation "726te_mult_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
+                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
+ "fa726te_issue+fa726te_mac_pipe")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+
+;; Scalar loads are pipelined in FA726TE LSU pipe.
+;; Here we model the resource conflict between Load@E3-stage & Store@W-stage
+;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
+;; same "bundle", the 2nd load will introduce another ISSUE stall but is still
+;; ok to execute (and may be beneficial sometimes)
+
+(define_insn_reservation "726te_load1_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load1,load_byte"))
+ "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
+  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
+
+(define_insn_reservation "726te_store1_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store1"))
+ "fa726te_blockage*2")
+
+;; Load/Store Multiple blocks all pipelines in EX stages until WB 
+;; No other instructions can be issued together.
+;; Since they essentially prevent all scheduling opportunities, we model them
+;; together here.
+
+;; If an LDM is broken into multiple load instructions, the later instruction
+;; in pipe 1 is stalled.
+(define_insn_reservation "726te_ldm2_op" 4
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load2,load3"))
+ "fa726te_blockage*4")
+
+(define_insn_reservation "726te_ldm3_op" 5
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load4"))
+ "fa726te_blockage*5")
+
+(define_insn_reservation "726te_stm2_op" 2
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store2,store3"))
+ "fa726te_blockage*3")
+
+(define_insn_reservation "726te_stm3_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store4"))
+ "fa726te_blockage*4")
+
+(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
+                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
+                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
+                 "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
+
+(define_bypass 4 "726te_load1_op" "726te_mult_op")
+(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
+(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "726te_branch_op" 0
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "branch"))
+ "fa726te_blockage")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 is ready for int return value. 
+(define_insn_reservation "726te_call_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "call"))
+ "fa726te_blockage")
+
diff -uNr tmp/gcc_svn/gcc/config/arm/fmp626.md gcc_svn/gcc/config/arm/fmp626.md
--- tmp/gcc_svn/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc_svn/gcc/config/arm/fmp626.md	2010-11-17 10:06:02.817458000 +0800
@@ -0,0 +1,184 @@
+;; Faraday FMP626 Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.  */
+
+;; These descriptions are based on the information contained in the
+;; FMP626 Core Design Note, Copyright (c) 2006 Faraday Technology Corp.
+;;
+
+;; Pipeline architecture
+;;	S	E	M	W(Q1)	Q2
+;;   ___________________________________________
+;;    shifter alu    
+;;    mul1    mul2    mul3
+;;    ld/st1  ld/st2  ld/st3  ld/st4  ld/st5
+
+;; This automaton provides a pipeline description for the Faraday
+;; FMP626 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+(define_cpu_unit "fmp626_core" "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "mp626_alu_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_alu_shift_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fmp626_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "mp626_mult1" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult2" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "mul,mla"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult3" 3
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_mult4" 4
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fmp626_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "mp626_load1_op" 5
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load1,load_byte"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_load2_op" 6
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load2,load3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_load3_op" 7
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load4"))
+ "fmp626_core*3")
+
+(define_insn_reservation "mp626_store1_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store1"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_store2_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store2,store3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_store3_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store4"))
+ "fmp626_core*3")
+
+(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op"
+                 "mp626_store1_op,mp626_store2_op,mp626_store3_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\
+                  mp626_mult3,mp626_mult4" "mp626_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op")
+(define_bypass 2 "mp626_mult3" "mp626_alu_op")
+(define_bypass 3 "mp626_mult4" "mp626_alu_op")
+(define_bypass 4 "mp626_load1_op" "mp626_alu_op")
+(define_bypass 5 "mp626_load2_op" "mp626_alu_op")
+(define_bypass 6 "mp626_load3_op" "mp626_alu_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "mp626_branch_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "branch"))
+ "fmp626_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.
+(define_insn_reservation "mp626_call_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "call"))
+ "fmp626_core")
+
diff -uNr tmp/gcc_svn/gcc/config/arm/t-arm gcc_svn/gcc/config/arm/t-arm
--- tmp/gcc_svn/gcc/config/arm/t-arm	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/t-arm	2010-11-17 10:06:02.820462000 +0800
@@ -24,6 +24,11 @@
 		$(srcdir)/config/arm/arm1020e.md \
 		$(srcdir)/config/arm/arm1026ejs.md \
 		$(srcdir)/config/arm/arm1136jfs.md \
+		$(srcdir)/config/arm/fa526.md \
+		$(srcdir)/config/arm/fa606te.md \
+		$(srcdir)/config/arm/fa626te.md \
+		$(srcdir)/config/arm/fmp626.md \
+		$(srcdir)/config/arm/fa726te.md \
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
diff -uNr tmp/gcc_svn/gcc/config/arm/t-arm-elf gcc_svn/gcc/config/arm/t-arm-elf
--- tmp/gcc_svn/gcc/config/arm/t-arm-elf	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/t-arm-elf	2010-11-17 10:07:57.748600000 +0800
@@ -36,6 +36,10 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+#MULTILIB_OPTIONS     += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te/mcpu=arm926ej-s
+#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te arm926ej-s
+#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
+
 #MULTILIB_OPTIONS      += march=armv7
 #MULTILIB_DIRNAMES     += thumb2
 #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
@@ -52,6 +56,8 @@
 MULTILIB_OPTIONS       += mfloat-abi=hard
 MULTILIB_DIRNAMES      += fpu
 MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
 
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
diff -uNr tmp/gcc_svn/gcc/config/arm/t-linux-eabi gcc_svn/gcc/config/arm/t-linux-eabi
--- tmp/gcc_svn/gcc/config/arm/t-linux-eabi	2010-11-15 16:25:28.000000000 +0800
+++ gcc_svn/gcc/config/arm/t-linux-eabi	2010-11-17 10:08:09.581461000 +0800
@@ -24,6 +24,10 @@
 MULTILIB_OPTIONS	=
 MULTILIB_DIRNAMES	=
 
+#MULTILIB_OPTIONS     += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te arm926ej-s
+#MULTILIB_EXCEPTIONS  += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te*
+
 # Use a version of div0 which raises SIGFPE, and a special __clear_cache.
 LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-22  7:55 [PATCH][4.6][ARM] New CPU support for Faraday cores M.F. Wu
@ 2010-11-22 18:36 ` Joseph S. Myers
  2010-11-23 10:11   ` M.F. Wu
  0 siblings, 1 reply; 11+ messages in thread
From: Joseph S. Myers @ 2010-11-22 18:36 UTC (permalink / raw)
  To: M.F. Wu; +Cc: gcc-patches

Generic comments:

Copyright and license notices on new files must be in the standard GCC 
form (GPLv3+, no FSF postal address).

If you add new -mcpu values you must update the documentation in 
invoke.texi to mention them.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-22 18:36 ` Joseph S. Myers
@ 2010-11-23 10:11   ` M.F. Wu
  2010-11-24 10:17     ` Ramana Radhakrishnan
  2010-11-24 12:20     ` Ramana Radhakrishnan
  0 siblings, 2 replies; 11+ messages in thread
From: M.F. Wu @ 2010-11-23 10:11 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 515 bytes --]

Dear Joseph,

Thank you for your comments.

I have updated the new added files and add
-mcpu info in doc/invoke.texi.

                                             Mingfeng

2010/11/23 Joseph S. Myers <joseph@codesourcery.com>
>
> Generic comments:
>
> Copyright and license notices on new files must be in the standard GCC
> form (GPLv3+, no FSF postal address).
>
> If you add new -mcpu values you must update the documentation in
> invoke.texi to mention them.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com

[-- Attachment #2: gcc-4.6-svn-20101116-faraday-cpu.patch --]
[-- Type: application/octet-stream, Size: 48484 bytes --]

diff -uNr tmp/gcc-4.6-svn-20101116/gcc/ChangeLog gcc-4.6-svn-20101116/gcc/ChangeLog
--- tmp/gcc-4.6-svn-20101116/gcc/ChangeLog	2010-11-23 13:43:41.725061000 +0800
+++ gcc-4.6-svn-20101116/gcc/ChangeLog	2010-11-23 14:40:20.796047000 +0800
@@ -1,3 +1,24 @@
+2010-11-22  Toolchain  <toolchain@faraday-tech.com>
+
+	Add Faraday CPU support - FA526/FA626/FA606TE/FA626TE/FMP626/FA726TE.
+	Modify files:
+	* config/arm/arm-cores.def
+	* config/arm/arm-tune.md
+	* config/arm/arm.c
+	* config/arm/arm.md
+	* config/arm/bpabi.h
+	* config/arm/t-arm
+	* config/arm/t-arm-elf
+	* config/arm/t-linux-eabi
+	* doc/invoke.texi
+	New added files:
+	* config/arm/fa526.md
+	* config/arm/fa626.md
+	* config/arm/fa606te.md
+	* config/arm/fa626te.md
+	* config/arm/fmp626.md
+	* config/arm/fa726te.md
+
 2010-11-13  Paolo Bonzini  <bonzini@gnu.org>
 
 	PR c/46462
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def	2010-11-23 13:43:46.812579000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def	2010-11-23 13:57:43.556323000 +0800
@@ -74,6 +74,8 @@
 ARM_CORE("strongarm110",  strongarm110,	4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1100", strongarm1100, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1110", strongarm1110, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE("fa526",         fa526,        4,                               FL_LDSCHED, fastmul)
+ARM_CORE("fa626",         fa626,        4,                               FL_LDSCHED, fastmul)
 
 /* V4T Architecture Processors */
 ARM_CORE("arm7tdmi",      arm7tdmi,	4T,	FL_CO_PROC          , fastmul)
@@ -104,6 +106,10 @@
 ARM_CORE("xscale",        xscale,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale)
 ARM_CORE("iwmmxt",        iwmmxt,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
 ARM_CORE("iwmmxt2",       iwmmxt2,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
+ARM_CORE("fa606te",       fa606te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa626te",       fa626te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fmp626",        fmp626,       5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa726te",       fa726te,      5TE,                             FL_LDSCHED, fa726te)
 
 /* V5TEJ Architecture Processors */
 ARM_CORE("arm926ej-s",    arm926ejs,	5TEJ,	                         FL_LDSCHED, 9e)
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md	2010-11-23 13:43:46.829584000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md	2010-11-23 13:57:43.559341000 +0800
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from arm-cores.def
 (define_attr "tune"
-	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
+	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
 	(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c gcc-4.6-svn-20101116/gcc/config/arm/arm.c
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c	2010-11-23 13:43:46.895582000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm.c	2010-11-23 13:57:43.649325000 +0800
@@ -128,6 +128,7 @@
 static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT);
 static int arm_comp_type_attributes (const_tree, const_tree);
 static void arm_set_default_type_attributes (tree);
+static int arm_sched_variable_issue (FILE *, int, rtx, int);
 static int arm_adjust_cost (rtx, rtx, rtx, int);
 static int count_insns_for_constant (HOST_WIDE_INT, int);
 static int arm_get_strip_length (int);
@@ -239,6 +240,7 @@
 static rtx arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
 static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -350,6 +352,9 @@
 #undef  TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
 #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes
 
+#undef  TARGET_SCHED_VARIABLE_ISSUE
+#define TARGET_SCHED_VARIABLE_ISSUE arm_sched_variable_issue
+
 #undef  TARGET_SCHED_ADJUST_COST
 #define TARGET_SCHED_ADJUST_COST arm_adjust_cost
 
@@ -862,6 +867,13 @@
   1
 };
 
+const struct tune_params arm_fa726te_tune =
+{
+  arm_9e_rtx_costs,
+  fa726te_sched_adjust_cost,
+  1
+};
+
 
 /* Not all of these give usefully different compilation alternatives,
    but there is no simple way of generalizing them.  */
@@ -7913,6 +7925,56 @@
   return true;
 }
 
+/* Adjust cost hook for FA726TE; may overwrite *COST (see below).  */
+static bool
+fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  /* Only a true dependence between two recognizable insns, where DEP sets
+     the condition flags, is adjusted here.  */
+  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
+      && recog_memoized (insn) >= 0
+      && recog_memoized (dep)  >= 0
+      && get_attr_conds (dep) == CONDS_SET)
+    {
+      /* Non-branch flag user (e.g. carry in 64-bit arithmetic): cost 3.  */
+      if (get_attr_conds(insn)  == CONDS_USE &&
+          get_attr_type(insn) != TYPE_BRANCH)
+        {
+          *cost = 3;
+          return false;
+        }
+
+      if (GET_CODE (PATTERN (insn)) == COND_EXEC
+          || get_attr_conds(insn)  == CONDS_USE)
+        {
+          *cost = 0;
+          return false;
+        }
+    }
+
+  return true;
+}
+
+/* Determine how many more instructions we can issue after INSN.  Fixes up
+   the problem that some UNSPECs get scheduled.  */
+static int
+arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED,
+                           int verbose  ATTRIBUTE_UNUSED, rtx insn, int more)
+{
+  if (arm_tune == fa726te
+      && recog_memoized (insn) >= 0 /* Is INSN recognizable?  */
+      && (get_attr_type (insn) == TYPE_ALU
+          || get_attr_type (insn) == TYPE_ALU_SHIFT
+          || get_attr_type (insn) == TYPE_ALU_SHIFT_REG))
+    {
+       return more;
+    }
+  else
+    {
+       return more-1;
+    }
+}
+
 /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
    It corrects the value of COST based on the relationship between
    INSN and DEP through the dependence LINK.  It returns the new
@@ -22722,6 +22784,7 @@
     case cortexa5:
     case cortexa8:
     case cortexa9:
+    case fa726te:
       return 2;
 
     default:
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md gcc-4.6-svn-20101116/gcc/config/arm/arm.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md	2010-11-23 13:43:46.939579000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm.md	2010-11-23 13:57:43.697327000 +0800
@@ -498,7 +498,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
+          (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -516,6 +516,11 @@
 (include "arm1020e.md")
 (include "arm1026ejs.md")
 (include "arm1136jfs.md")
+(include "fa526.md")
+(include "fa606te.md")
+(include "fa626te.md")
+(include "fmp626.md")
+(include "fa726te.md")
 (include "cortex-a5.md")
 (include "cortex-a8.md")
 (include "cortex-a9.md")
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h	2010-11-23 13:43:46.998580000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h	2010-11-23 13:57:43.702332000 +0800
@@ -52,7 +52,7 @@
 /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
 
-#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa5*|mcpu=fa626:--fix-v4bx}"
 
 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
 
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md	2010-11-23 14:36:17.916371000 +0800
@@ -0,0 +1,161 @@
+;; Faraday FA526 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; These descriptions are based on the information contained in the
+;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 3 (2 cycle penalty)
+;; ALU -> any use: latency = 2 (1 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA526 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa526_core" "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "526_alu_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu"))
+ "fa526_core")
+
+(define_insn_reservation "526_alu_shift_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa526_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "526_mult1" 2  ; latency 2 for the multiplies listed below
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "smlalxy,smulxy,smlaxy"))
+ "fa526_core")  ; reserves the single core unit for one cycle
+
+(define_insn_reservation "526_mult2" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
+                       umlals,smulls,smlals,smlawx"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "526_load1_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load1,load_byte"))
+ "fa526_core")
+
+(define_insn_reservation "526_load2_op" 4
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_load3_op" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_load4_op" 6
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load4"))
+ "fa526_core*4")
+
+(define_insn_reservation "526_store1_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store1"))
+ "fa526_core")
+
+(define_insn_reservation "526_store2_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_store3_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_store4_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store4"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "526_branch_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "branch"))
+ "fa526_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. For most cases, the return value is set 
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "526_call_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "call"))
+ "fa526_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md	2010-11-23 14:36:35.789647000 +0800
@@ -0,0 +1,172 @@
+;; Faraday FA606TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; These descriptions are based on the information contained in the
+;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 2 (1 cycle penalty)
+;; ALU -> any use: latency = 1 (0 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA606TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      E      M      W
+
+(define_cpu_unit "fa606te_core" "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "606te_alu_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "alu,alu_shift,alu_shift_reg"))
+ "fa606te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "606te_mult1" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlalxy"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_mult2" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_mult3" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "mul,mla,muls,mlas"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_mult4" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals"))
+ "fa606te_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "606te_load1_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_load2_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_load3_op" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_load4_op" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load4"))
+ "fa606te_core*4")
+
+(define_insn_reservation "606te_store1_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store1"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_store2_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_store3_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_store4_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store4"))
+ "fa606te_core*4")
+
+
+;;(define_insn_reservation "606te_ldm_op" 9
+;; (and (eq_attr "tune" "fa606te")
+;;      (eq_attr "type" "load2,load3,load4,store2,store3,store4"))
+;; "fa606te_core*7")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "606te_branch_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "branch"))
+ "fa606te_core")
+
+;; The latency for a call is actually the latency until the result is available,
+;; i.e. R0 ready for int return value.  In most cases, the return value is set by a
+;; mov instruction, which has 1 cycle latency.
+(define_insn_reservation "606te_call_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "call"))
+ "fa606te_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md	2010-11-23 14:35:59.928102000 +0800
@@ -0,0 +1,166 @@
+;; Faraday FA626TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; These descriptions are based on the information contained in the
+;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; ALU -> simple address LDR/STR: latency=2 (available after 2 cycles)
+;; ALU -> shifted address LDR/STR: latency=3
+;;		( extra 1 cycle unavoidable stall)
+;; ALU -> other use: latency=2 (available after 2 cycles)
+;; LD  -> simple address LDR/STR: latency=3 (available after 3 cycles)
+;; LD  -> shifted address LDR/STR: latency=4 
+;;		( extra 1 cycle unavoidable stall)
+;; LD  -> any other use: latency = 3 (available after 3 cycles)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA626TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa626te_core" "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "626te_alu_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_alu_shift_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa626te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "626te_mult1" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult2" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "mul,mla"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult3" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_mult4" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "626te_load1_op" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_load2_op" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load2,load3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_load3_op" 5
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load4"))
+ "fa626te_core*3")
+
+(define_insn_reservation "626te_store1_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store1"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_store2_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store2,store3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_store3_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store4"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "626te_branch_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "branch"))
+ "fa626te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. 
+(define_insn_reservation "626te_call_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "call"))
+ "fa626te_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	2010-11-23 14:37:04.268859000 +0800
@@ -0,0 +1,221 @@
+;; Faraday FA726TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; These descriptions are based on the information contained in the
+;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA726TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa726te")
+(automata_option "ndfa")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;	E1	E2	E3	E4	E5	WB
+;;______________________________________________________
+;;
+;;      <-------------- LD/ST ----------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
+;;______________________________________________________
+;;
+;;      <---------- MUL --------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
+
+
+(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
+(define_cpu_unit "fa726te_mac_pipe" "fa726te")
+(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
+;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
+;; improve code quality
+(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
+(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
+
+(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
+;; reservation which blocks IS
+(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; Move instructions.
+(define_insn_reservation "726te_shift_op" 1
+  (and (eq_attr "tune" "fa726te")
+       (eq_attr "insn" "mov,mvn"))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with no shifted operand will finish in 1 cycle
+;; Other ALU instructions 2 cycles
+(define_insn_reservation "726te_alu_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the execute stage.
+;; If shift+LU, it takes 2 cycles. If shift+AU, it takes 3 cycles.
+(define_insn_reservation "726te_alu_shift_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+(define_insn_reservation "726te_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift_reg")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times. Multiply operations occur in both the execute and memory
+;; stages of the pipeline
+
+(define_insn_reservation "726te_mult_op" 3  ; one latency (3) for every multiply listed below
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
+                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
+ "fa726te_issue+fa726te_mac_pipe")  ; NOTE(review): smulwy/smlawy are not in the list above - confirm intended
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+
+;; Scalar loads are pipelined in FA726TE LSU pipe.
+;; Here we model the resource conflict between Load@E3-stage & Store@W-stage
+;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
+;; same "bundle", the 2nd load will introduce another ISSUE stall but is still
+;; ok to execute (and may be beneficial sometimes).
+
+(define_insn_reservation "726te_load1_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load1,load_byte"))
+ "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
+  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
+
+(define_insn_reservation "726te_store1_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store1"))
+ "fa726te_blockage*2")
+
+;; Load/Store Multiple blocks all pipelines in EX stages until WB 
+;; No other instructions can be issued together.
+;; Since they essentially prevent all scheduling opportunities, we model them
+;; together here.
+
+;; If an LDM is broken into multiple load instructions, the later instruction
+;; in pipe 1 is stalled.
+(define_insn_reservation "726te_ldm2_op" 4
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load2,load3"))
+ "fa726te_blockage*4")
+
+(define_insn_reservation "726te_ldm3_op" 5
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load4"))
+ "fa726te_blockage*5")
+
+(define_insn_reservation "726te_stm2_op" 2
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store2,store3"))
+ "fa726te_blockage*3")
+
+(define_insn_reservation "726te_stm3_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store4"))
+ "fa726te_blockage*4")
+
+(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
+                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
+                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
+                 "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
+
+(define_bypass 4 "726te_load1_op" "726te_mult_op")
+(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
+(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "726te_branch_op" 0
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "branch"))
+ "fa726te_blockage")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 is ready for int return value. 
+(define_insn_reservation "726te_call_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "call"))
+ "fa726te_blockage")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md	2010-11-23 14:36:50.883213000 +0800
@@ -0,0 +1,183 @@
+;; Faraday FMP626 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; These descriptions are based on the information contained in the
+;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Pipeline architecture
+;;	S	E	M	W(Q1)	Q2
+;;   ___________________________________________
+;;    shifter alu    
+;;    mul1    mul2    mul3
+;;    ld/st1  ld/st2  ld/st3  ld/st4  ld/st5
+
+;; This automaton provides a pipeline description for the Faraday
+;; FMP626 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+(define_cpu_unit "fmp626_core" "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "mp626_alu_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_alu_shift_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fmp626_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "mp626_mult1" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult2" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "mul,mla"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult3" 3
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_mult4" 4
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fmp626_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "mp626_load1_op" 5
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load1,load_byte"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_load2_op" 6
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load2,load3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_load3_op" 7
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load4"))
+ "fmp626_core*3")
+
+(define_insn_reservation "mp626_store1_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store1"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_store2_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store2,store3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_store3_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store4"))
+ "fmp626_core*3")
+
+(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op"
+                 "mp626_store1_op,mp626_store2_op,mp626_store3_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\
+                  mp626_mult3,mp626_mult4" "mp626_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op")
+(define_bypass 2 "mp626_mult3" "mp626_alu_op")
+(define_bypass 3 "mp626_mult4" "mp626_alu_op")
+(define_bypass 4 "mp626_load1_op" "mp626_alu_op")
+(define_bypass 5 "mp626_load2_op" "mp626_alu_op")
+(define_bypass 6 "mp626_load3_op" "mp626_alu_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "mp626_branch_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "branch"))
+ "fmp626_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.
+(define_insn_reservation "mp626_call_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "call"))
+ "fmp626_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm gcc-4.6-svn-20101116/gcc/config/arm/t-arm
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm	2010-11-23 13:43:47.213582000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm	2010-11-23 13:57:43.738329000 +0800
@@ -24,6 +24,11 @@
 		$(srcdir)/config/arm/arm1020e.md \
 		$(srcdir)/config/arm/arm1026ejs.md \
 		$(srcdir)/config/arm/arm1136jfs.md \
+		$(srcdir)/config/arm/fa526.md \
+		$(srcdir)/config/arm/fa606te.md \
+		$(srcdir)/config/arm/fa626te.md \
+		$(srcdir)/config/arm/fmp626.md \
+		$(srcdir)/config/arm/fa726te.md \
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf	2010-11-23 13:43:47.221580000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf	2010-11-23 13:57:43.744348000 +0800
@@ -36,6 +36,10 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+#MULTILIB_OPTIONS     += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te/mcpu=arm926ej-s
+#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te arm926ej-s
+#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
+
 #MULTILIB_OPTIONS      += march=armv7
 #MULTILIB_DIRNAMES     += thumb2
 #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
@@ -52,6 +56,8 @@
 MULTILIB_OPTIONS       += mfloat-abi=hard
 MULTILIB_DIRNAMES      += fpu
 MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
 
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi	2010-11-23 13:43:47.231588000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi	2010-11-23 13:57:43.750343000 +0800
@@ -24,6 +24,10 @@
 MULTILIB_OPTIONS	=
 MULTILIB_DIRNAMES	=
 
+#MULTILIB_OPTIONS     += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te arm926ej-s
+#MULTILIB_EXCEPTIONS  += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te*
+
 # Use a version of div0 which raises SIGFPE, and a special __clear_cache.
 LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
 
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi gcc-4.6-svn-20101116/gcc/doc/invoke.texi
--- tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi	2010-11-23 13:44:15.846799000 +0800
+++ gcc-4.6-svn-20101116/gcc/doc/invoke.texi	2010-11-23 14:33:07.490613000 +0800
@@ -10115,7 +10115,9 @@
 @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3},
 @samp{cortex-m1},
 @samp{cortex-m0},
-@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312},
+@samp{fa526}, @samp{fa626},
+@samp{fa606te}, @samp{fa626te}, @samp{fmp626}, @samp{fa726te}.
 
 @item -mtune=@var{name}
 @opindex mtune

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-23 10:11   ` M.F. Wu
@ 2010-11-24 10:17     ` Ramana Radhakrishnan
  2010-11-25 11:31       ` M.F. Wu
  2010-11-24 12:20     ` Ramana Radhakrishnan
  1 sibling, 1 reply; 11+ messages in thread
From: Ramana Radhakrishnan @ 2010-11-24 10:17 UTC (permalink / raw)
  To: M.F. Wu; +Cc: Joseph S. Myers, gcc-patches

Hi Mingfeng,

I'm not a maintainer and cannot approve or reject your patch. However I
have a few comments / queries regarding your patch.


> 
> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/ChangeLog
> gcc-4.6-svn-20101116/gcc/ChangeLog
> --- tmp/gcc-4.6-svn-20101116/gcc/ChangeLog    2010-11-23
> 13:43:41.725061000 +0800
> +++ gcc-4.6-svn-20101116/gcc/ChangeLog        2010-11-23
> 14:40:20.796047000 +0800
> @@ -1,3 +1,24 @@
> +2010-11-22  Toolchain  <toolchain@faraday-tech.com>
> 
It's generally been the practice to have a Changelog entry refer to real
people rather than a generic support address. I believe Faraday Tech has
a generic copyright assignment on file that covers this contribution but
it's usually been our practice to state explicit authors. Thus I think
that the authors of this patch should be named in the Changelog entry
even if there are multiple authors.

> 
> +
> +     Add Faraday CPU support -
> FA526/FA626/FA606TE/FA626TE/FMP626/FA726TE.
> +     Modify files:
> +     * config/arm/arm-cores.def
> +     * config/arm/arm-tune.md
> +     * config/arm/arm.c
> +     * config/arm/arm.md
> +     * config/arm/bpabi.h
> +     * config/arm/t-arm
> +     * config/arm/t-arm-elf
> +     * config/arm/t-linux-eabi
> +     * doc/invoke.texi
> +     New added files:
> +     * config/arm/fa526.md
> +     * config/arm/fa626.md
> +     * config/arm/fa606te.md
> +     * config/arm/fa626te.md
> +     * config/arm/fmp626.md
> +     * config/arm/fa726te.md
> +
> 

Please create a ChangeLog as specified in the GNU coding standards
linked from http://gcc.gnu.org/contribute.html#standards . The ChangeLog
should not be a part of the final patch you submit but a part of the
mail you send out.

Please look at the Changelog file or other patches in the mailing list
archives for other examples in that area. Thanks. 

If there are new files (like fa526.md etc.), they can just be marked as
New.  If there are changes to existing files, e.g. config/arm/arm.c,
a short description of those changes must be listed in the
ChangeLog entry.



> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c
> gcc-4.6-svn-20101116/gcc/config/arm/arm.c
> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c     2010-11-23
> 13:43:46.895582000 +0800
> > +++ gcc-4.6-svn-20101116/gcc/config/arm/arm.c 2010-11-23
> 13:57:43.649325000 +0800
> > @@ -128,6 +128,7 @@
> >  static void thumb1_output_function_prologue (FILE *,
> HOST_WIDE_INT);
> >  static int arm_comp_type_attributes (const_tree, const_tree);
> >  static void arm_set_default_type_attributes (tree);
> > +static int arm_sched_variable_issue (FILE *, int, rtx, int);
> >  static int arm_adjust_cost (rtx, rtx, rtx, int);
> >  static int count_insns_for_constant (HOST_WIDE_INT, int);
> >  static int arm_get_strip_length (int);
> > @@ -239,6 +240,7 @@
> >  static rtx arm_pic_static_addr (rtx orig, rtx reg);
> >  static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
> >  static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
> > +static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
> >  static enum machine_mode arm_preferred_simd_mode (enum
> machine_mode);
> >  static bool arm_class_likely_spilled_p (reg_class_t);
> >  static bool arm_vector_alignment_reachable (const_tree type, bool
> is_packed);
> > @@ -350,6 +352,9 @@
> >  #undef  TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
> >  #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
> arm_set_default_type_attributes
> >  
> > +#undef  TARGET_SCHED_VARIABLE_ISSUE
> > +#define TARGET_SCHED_VARIABLE_ISSUE arm_sched_variable_issue
> > +
> > @@ -7913,6 +7925,56 @@
> >    return true;
> >  }
> >  
> > +/* Adjust cost hook for FA726TE.  */
> > +static bool
> > +fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
> > +{
> > +  /* For FA726TE, true dependency on CPSR (i.e. set cond followed
> by predicated)
> > +     have penalty of 3 */
> > +  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
> > +      && recog_memoized (insn) >= 0
> > +      && recog_memoized (dep)  >= 0
> > +      && get_attr_conds (dep) == CONDS_SET)
> > +    {
> > +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle
> latency */
> > +      if (get_attr_conds(insn)  == CONDS_USE &&
> > +          get_attr_type(insn) != TYPE_BRANCH)
> > +        {
> > +          *cost = 3;
> > +          return false;
> > +        }
> > +
> > +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
> > +          || get_attr_conds(insn)  == CONDS_USE)
> > +        {
> > +          *cost = 0;
> > +          return false;
> > +        }
> > +    }
> > +
> > +  return true;
> > +}
> > +
> > +/* Determine how many instructions can we issue. Fixup the issue
> that some
> > +   UNSPECs get scheduled. */
> > +static int
> > +arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED,
> > +                           int verbose  ATTRIBUTE_UNUSED, rtx insn,
> int more)

Can you make sure that int verbose is aligned to be just below the
column for FILE *f ?

> 
> > +{
> > +  if (arm_tune == fa726te
> > +      && recog_memoized (insn) >= 0 /* insn recognizable? */
> > +      && (get_attr_type (insn) == TYPE_ALU
> > +          || get_attr_type (insn) == TYPE_ALU_SHIFT
> > +          || get_attr_type (insn) == TYPE_ALU_SHIFT_REG))
> 
Given that we are moving towards a backend where core-specific
information is stored in the tune_params structure, it might be an
idea to add sched_variable_issue to the costs infrastructure,
initialize it to NULL in the other cases, and return more - 1 by
default.


> 
> > +  else
> > +    {
> > +       return more-1;
> 
This should be "return more - 1;".  Notice the extra spaces around the
'-', between 'more' and '1'.


> 
> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
> 08:00:00.000000000 +0800
> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
> 14:36:17.916371000 +0800
> 
> <....>
> snipped
> 
> > +
> >
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> > +;; Branch and Call Instructions
> >
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> > +
> > +;; Branch instructions are difficult to model accurately.  The ARM
s/ARM/FA526.

And equivalently in all the other pipeline descriptions. 


> > 
> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm
> gcc-4.6-svn-20101116/gcc/config/arm/t-arm
> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm     2010-11-23
> 13:43:47.213582000 +0800
> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm 2010-11-23
> 13:57:43.738329000 +0800
> > @@ -24,6 +24,11 @@
> >               $(srcdir)/config/arm/arm1020e.md \
> >               $(srcdir)/config/arm/arm1026ejs.md \
> >               $(srcdir)/config/arm/arm1136jfs.md \
> > +             $(srcdir)/config/arm/fa526.md \
> > +             $(srcdir)/config/arm/fa606te.md \
> > +             $(srcdir)/config/arm/fa626te.md \
> > +             $(srcdir)/config/arm/fmp626.md \
> > +             $(srcdir)/config/arm/fa726te.md \
> >               $(srcdir)/config/arm/arm926ejs.md \
> >               $(srcdir)/config/arm/cirrus.md \
> >               $(srcdir)/config/arm/fpa.md \
> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
> gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf 2010-11-23
> 13:43:47.221580000 +0800
> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf     2010-11-23
> 13:57:43.744348000 +0800
> > @@ -36,6 +36,10 @@
> >  MULTILIB_EXCEPTIONS  = 
> >  MULTILIB_MATCHES     =
> > +#MULTILIB_OPTIONS     +=
> mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te/mcpu=arm926ej-s
> > +#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te
> arm926ej-s
> 

Even though these are commented out, I assume that they are in here
because they are known to work and you expect them to keep working.

Why do you have arm926ej-s here ? 

> 
> > +#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
> > +
> 
> 
> >  #MULTILIB_OPTIONS      += march=armv7
> >  #MULTILIB_DIRNAMES     += thumb2
> >  #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
> > @@ -52,6 +56,8 @@
> >  MULTILIB_OPTIONS       += mfloat-abi=hard
> >  MULTILIB_DIRNAMES      += fpu
> >  MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
> > +MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
> > +MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
> >  
> >  # MULTILIB_OPTIONS    += mcpu=ep9312
> >  # MULTILIB_DIRNAMES   += ep9312
> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
> gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
> > ---
> tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi      2010-11-23
> 13:43:47.231588000 +0800
> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi  2010-11-23
> 13:57:43.750343000 +0800
> > @@ -24,6 +24,10 @@
> >  MULTILIB_OPTIONS     =
> >  MULTILIB_DIRNAMES    =
> > +#MULTILIB_OPTIONS     +=
> mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
> > +#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te arm926ej-s
> 
These variables even though commented out should work correctly when
uncommented.  

Why does this have arm926ej-s for MULTILIB_DIRNAMES with no
corresponding entry for mcpu=arm926ej-s in MULTILIB_OPTIONS ? Each
entry in MULTILIB_DIRNAMES must have an equivalent entry in
MULTILIB_OPTIONS. Even though the extra entry for arm926ej-s is
superfluous, it would be better to remove it from here. 


Cheers
Ramana


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-23 10:11   ` M.F. Wu
  2010-11-24 10:17     ` Ramana Radhakrishnan
@ 2010-11-24 12:20     ` Ramana Radhakrishnan
  1 sibling, 0 replies; 11+ messages in thread
From: Ramana Radhakrishnan @ 2010-11-24 12:20 UTC (permalink / raw)
  To: M.F. Wu; +Cc: Joseph S. Myers, gcc-patches



> /* Determine how many instructions can we issue. Fixup the issue that
> some
> +   UNSPECs get scheduled. */
> +static int
> +arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED,
> +                           int verbose  ATTRIBUTE_UNUSED, rtx insn,
> int more)
> +{
> +  if (arm_tune == fa726te
> +      && recog_memoized (insn) >= 0 /* insn recognizable? */
> +      && (get_attr_type (insn) == TYPE_ALU
> +          || get_attr_type (insn) == TYPE_ALU_SHIFT
> +          || get_attr_type (insn) == TYPE_ALU_SHIFT_REG))
> +    {
> +       return more;
> +    }
> +  else
> +    {
> +       return more-1;
> +    }
> +}

It would be good practice to indicate in the dump file what you are
doing with a particular insn here. Otherwise we'd have to fire up a
debugger to figure out what's going on.

I am not totally sure that this is correct. You are returning more - 1
in the case where the insn might be a USE or a CLOBBER, while the
documentation states that you should return more - 1 only in the
case of valid insns.

Thus you probably want to make that check more robust in the else case
where you return more - 1 today.

else
{  
if (NONDEBUG_INSN_P (insn) && GET_CODE (PATTERN (insn)) != USE
   && GET_CODE (PATTERN (insn)) != CLOBBER)
   return more - 1;
else 
    return more;
}


cheers
Ramana




On Tue, 2010-11-23 at 14:49 +0800, M.F. Wu wrote:
> Dear Joseph,
> 
> Thank you for your comments.
> 
> I have updated the new added files and add
> -mcpu info in doc/invoke.texi.
> 
>                                              Mingfeng
> 
> 2010/11/23 Joseph S. Myers <joseph@codesourcery.com>
> >
> > Generic comments:
> >
> > Copyright and license notices on new files must be in the standard GCC
> > form (GPLv3+, no FSF postal address).
> >
> > If you add new -mcpu values you must update the documentation in
> > invoke.texi to mention them.
> >
> > --
> > Joseph S. Myers
> > joseph@codesourcery.com


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-24 10:17     ` Ramana Radhakrishnan
@ 2010-11-25 11:31       ` M.F. Wu
  2010-11-30 14:51         ` Ramana Radhakrishnan
  0 siblings, 1 reply; 11+ messages in thread
From: M.F. Wu @ 2010-11-25 11:31 UTC (permalink / raw)
  To: ramana.radhakrishnan; +Cc: Joseph S. Myers, gcc-patches, toolchain

[-- Attachment #1: Type: text/plain, Size: 10526 bytes --]

Dear Ramana,

Thank you for your advice.

                                  Mingfeng Wu

2010/11/24 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> Hi Mingfeng,
>
> I'm not a maintainer and cannot approve or reject your patch. However I
> have a few comments / queries regarding your patch.
>
>
>>
>> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/ChangeLog
>> gcc-4.6-svn-20101116/gcc/ChangeLog
>> --- tmp/gcc-4.6-svn-20101116/gcc/ChangeLog    2010-11-23
>> 13:43:41.725061000 +0800
>> +++ gcc-4.6-svn-20101116/gcc/ChangeLog        2010-11-23
>> 14:40:20.796047000 +0800
>> @@ -1,3 +1,24 @@
>> +2010-11-22  Toolchain  <toolchain@faraday-tech.com>
>>
> It's generally been the practice to have a Changelog entry refer to real
> people rather than a generic support address. I believe Faraday Tech has
> a generic copyright assignment on file that covers this contribution but
> it's usually been our practice to state explicit authors. Thus I think
> that the authors of this patch should be named in the Changelog entry
> even if there are multiple authors.
>

I have added the authors to the list and modified the Changelog entry.

>>
>> +
>> +     Add Faraday CPU support -
>> FA526/FA626/FA606TE/FA626TE/FMP626/FA726TE.
>> +     Modify files:
>> +     * config/arm/arm-cores.def
>> +     * config/arm/arm-tune.md
>> +     * config/arm/arm.c
>> +     * config/arm/arm.md
>> +     * config/arm/bpabi.h
>> +     * config/arm/t-arm
>> +     * config/arm/t-arm-elf
>> +     * config/arm/t-linux-eabi
>> +     * doc/invoke.texi
>> +     New added files:
>> +     * config/arm/fa526.md
>> +     * config/arm/fa626.md
>> +     * config/arm/fa606te.md
>> +     * config/arm/fa626te.md
>> +     * config/arm/fmp626.md
>> +     * config/arm/fa726te.md
>> +
>>
>
> Please create a ChangeLog as specified in the GNU coding standards
> linked from http://gcc.gnu.org/contribute.html#standards . The ChangeLog
> should not be a part of the final patch you submit but a part of the
> mail you send out.
>
> Please look at the Changelog file or other patches in the mailing list
> archives for other examples in that area. Thanks.
>
> If there are new files (like fa526.md etc.) they can just be marked as
> New.  If there are changes to existing files for e.g. config/arm/arm.c ,
> a small description of the changes to them must be listed in the
> Changelog entry.
>
>
>
>> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c
>> gcc-4.6-svn-20101116/gcc/config/arm/arm.c
>> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c     2010-11-23
>> 13:43:46.895582000 +0800
>> > +++ gcc-4.6-svn-20101116/gcc/config/arm/arm.c 2010-11-23
>> 13:57:43.649325000 +0800
>> > @@ -128,6 +128,7 @@
>> >  static void thumb1_output_function_prologue (FILE *,
>> HOST_WIDE_INT);
>> >  static int arm_comp_type_attributes (const_tree, const_tree);
>> >  static void arm_set_default_type_attributes (tree);
>> > +static int arm_sched_variable_issue (FILE *, int, rtx, int);
>> >  static int arm_adjust_cost (rtx, rtx, rtx, int);
>> >  static int count_insns_for_constant (HOST_WIDE_INT, int);
>> >  static int arm_get_strip_length (int);
>> > @@ -239,6 +240,7 @@
>> >  static rtx arm_pic_static_addr (rtx orig, rtx reg);
>> >  static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
>> >  static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
>> > +static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
>> >  static enum machine_mode arm_preferred_simd_mode (enum
>> machine_mode);
>> >  static bool arm_class_likely_spilled_p (reg_class_t);
>> >  static bool arm_vector_alignment_reachable (const_tree type, bool
>> is_packed);
>> > @@ -350,6 +352,9 @@
>> >  #undef  TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
>> >  #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
>> arm_set_default_type_attributes
>> >
>> > +#undef  TARGET_SCHED_VARIABLE_ISSUE
>> > +#define TARGET_SCHED_VARIABLE_ISSUE arm_sched_variable_issue
>> > +
>> > @@ -7913,6 +7925,56 @@
>> >    return true;
>> >  }
>> >
>> > +/* Adjust cost hook for FA726TE.  */
>> > +static bool
>> > +fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
>> > +{
>> > +  /* For FA726TE, true dependency on CPSR (i.e. set cond followed
>> by predicated)
>> > +     have penalty of 3 */
>> > +  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
>> > +      && recog_memoized (insn) >= 0
>> > +      && recog_memoized (dep)  >= 0
>> > +      && get_attr_conds (dep) == CONDS_SET)
>> > +    {
>> > +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle
>> latency */
>> > +      if (get_attr_conds(insn)  == CONDS_USE &&
>> > +          get_attr_type(insn) != TYPE_BRANCH)
>> > +        {
>> > +          *cost = 3;
>> > +          return false;
>> > +        }
>> > +
>> > +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
>> > +          || get_attr_conds(insn)  == CONDS_USE)
>> > +        {
>> > +          *cost = 0;
>> > +          return false;
>> > +        }
>> > +    }
>> > +
>> > +  return true;
>> > +}
>> > +
>> > +/* Determine how many instructions can we issue. Fixup the issue
>> that some
>> > +   UNSPECs get scheduled. */
>> > +static int
>> > +arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED,
>> > +                           int verbose  ATTRIBUTE_UNUSED, rtx insn,
>> int more)
>
> Can you make sure that int verbose is aligned to be just below the
> column for FILE *f ?
>
>>
>> > +{
>> > +  if (arm_tune == fa726te
>> > +      && recog_memoized (insn) >= 0 /* insn recognizable? */
>> > +      && (get_attr_type (insn) == TYPE_ALU
>> > +          || get_attr_type (insn) == TYPE_ALU_SHIFT
>> > +          || get_attr_type (insn) == TYPE_ALU_SHIFT_REG))
>>
> Given that we are moving towards a backend where core specific
> information is
> stored in the tune_params structure , it might be an idea to add
> sched_variable_issue
> to the costs infrastructure initialize this to NULL in other cases
> and return more - 1 by default.
>

The function arm_sched_variable_issue has been removed.

>
>>
>> > +  else
>> > +    {
>> > +       return more-1;
>>
> This should be "return more - 1;" Notice the extra spaces between more
> '-' and '1'.
>
>
>>
>> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
>> 08:00:00.000000000 +0800
>> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
>> 14:36:17.916371000 +0800
>>
>> <....>
>> snipped
>>
>> > +
>> >
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> > +;; Branch and Call Instructions
>> >
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> > +
>> > +;; Branch instructions are difficult to model accurately.  The ARM
> s/ARM/FA526.
>
> And equivalently in all the other pipeline descriptions.
>

Sorry, I don't understand exactly what you mean...

>
>> >
>> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm
>> gcc-4.6-svn-20101116/gcc/config/arm/t-arm
>> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm     2010-11-23
>> 13:43:47.213582000 +0800
>> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm 2010-11-23
>> 13:57:43.738329000 +0800
>> > @@ -24,6 +24,11 @@
>> >               $(srcdir)/config/arm/arm1020e.md \
>> >               $(srcdir)/config/arm/arm1026ejs.md \
>> >               $(srcdir)/config/arm/arm1136jfs.md \
>> > +             $(srcdir)/config/arm/fa526.md \
>> > +             $(srcdir)/config/arm/fa606te.md \
>> > +             $(srcdir)/config/arm/fa626te.md \
>> > +             $(srcdir)/config/arm/fmp626.md \
>> > +             $(srcdir)/config/arm/fa726te.md \
>> >               $(srcdir)/config/arm/arm926ejs.md \
>> >               $(srcdir)/config/arm/cirrus.md \
>> >               $(srcdir)/config/arm/fpa.md \
>> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
>> gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
>> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf 2010-11-23
>> 13:43:47.221580000 +0800
>> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf     2010-11-23
>> 13:57:43.744348000 +0800
>> > @@ -36,6 +36,10 @@
>> >  MULTILIB_EXCEPTIONS  =
>> >  MULTILIB_MATCHES     =
>> > +#MULTILIB_OPTIONS     +=
>> mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te/mcpu=arm926ej-s
>> > +#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te
>> arm926ej-s
>>
>
> Even though these are commented I assume that these are in here because
> they are known
> to work and you expect it to keep working.
>
> Why do you have arm926ej-s here ?
>

Removed arm926ej-s.

>>
>> > +#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
>> > +
>>
>>
>> >  #MULTILIB_OPTIONS      += march=armv7
>> >  #MULTILIB_DIRNAMES     += thumb2
>> >  #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
>> > @@ -52,6 +56,8 @@
>> >  MULTILIB_OPTIONS       += mfloat-abi=hard
>> >  MULTILIB_DIRNAMES      += fpu
>> >  MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
>> > +MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
>> > +MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
>> >
>> >  # MULTILIB_OPTIONS    += mcpu=ep9312
>> >  # MULTILIB_DIRNAMES   += ep9312
>> > diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
>> gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
>> > ---
>> tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi      2010-11-23
>> 13:43:47.231588000 +0800
>> > +++ gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi  2010-11-23
>> 13:57:43.750343000 +0800
>> > @@ -24,6 +24,10 @@
>> >  MULTILIB_OPTIONS     =
>> >  MULTILIB_DIRNAMES    =
>> > +#MULTILIB_OPTIONS     +=
>> mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
>> > +#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te arm926ej-s
>>
> These variables even though commented out should work correctly when
> uncommented.
>
> Why does this have arm926ej-s for MULTILIB_DIRNAMES with no
> corresponding entry for mcpu=arm926ej-s in MULTILIB_OPTIONS ? Each
> entry in MULTILIB_DIRNAMES must have an equivalent entry in
> MULTILIB_OPTIONS. Even though the extra entry for arm926ej-s is
> superfluous, it would be better to remove it from here.
>

Removed arm926ej-s.

>
> Cheers
> Ramana
>
>
>

[-- Attachment #2: gcc-4.6-svn-20101116-faraday-cpu.patch --]
[-- Type: application/octet-stream, Size: 47702 bytes --]

diff -uNr tmp/gcc-4.6-svn-20101116/gcc/ChangeLog gcc-4.6-svn-20101116/gcc/ChangeLog
--- tmp/gcc-4.6-svn-20101116/gcc/ChangeLog	2010-11-23 13:43:41.725061000 +0800
+++ gcc-4.6-svn-20101116/gcc/ChangeLog	2010-11-25 17:13:37.966265000 +0800
@@ -1,3 +1,28 @@
+2010-11-22  Sanjin Liu  <scliu@faraday-tech.com>
+	    Mingfeng Wu  <mingfeng@faraday-tech.com>
+
+	* config/arm/arm-cores.def: Add Faraday CPU support - 
+	fa526/fa626/fa606te/fa626te/fmp626/fa726te.
+	* config/arm/arm-tune.md: Regenerate.
+	* config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te.
+	(fa726te_sched_adjust_cost): New cost function for fa726te.
+	(arm_issue_rate): Add fa726te.
+	* config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
+	and include machine description files.
+	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
+	* config/arm/t-arm (MD_INCLUDES): Include machine description files for
+	Faraday cores.
+	* config/arm/t-arm-elf: Add multilib option for Faraday cores.
+	* config/arm/t-linux-eabi: Add multilib option for Faraday cores except
+	fa526 and fa626.
+	* doc/invoke.texi: Document -mcpu for Faraday cores.
+	* config/arm/fa526.md: New file.
+	* config/arm/fa626.md: New file.
+	* config/arm/fa606te.md: New file.
+	* config/arm/fa626te.md: New file.
+	* config/arm/fmp626.md: New file.
+	* config/arm/fa726te.md: New file.
+
 2010-11-13  Paolo Bonzini  <bonzini@gnu.org>
 
 	PR c/46462
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def	2010-11-23 13:43:46.812579000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def	2010-11-23 13:57:43.556323000 +0800
@@ -74,6 +74,8 @@
 ARM_CORE("strongarm110",  strongarm110,	4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1100", strongarm1100, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1110", strongarm1110, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE("fa526",         fa526,        4,                               FL_LDSCHED, fastmul)
+ARM_CORE("fa626",         fa626,        4,                               FL_LDSCHED, fastmul)
 
 /* V4T Architecture Processors */
 ARM_CORE("arm7tdmi",      arm7tdmi,	4T,	FL_CO_PROC          , fastmul)
@@ -104,6 +106,10 @@
 ARM_CORE("xscale",        xscale,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale)
 ARM_CORE("iwmmxt",        iwmmxt,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
 ARM_CORE("iwmmxt2",       iwmmxt2,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
+ARM_CORE("fa606te",       fa606te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa626te",       fa626te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fmp626",        fmp626,       5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa726te",       fa726te,      5TE,                             FL_LDSCHED, fa726te)
 
 /* V5TEJ Architecture Processors */
 ARM_CORE("arm926ej-s",    arm926ejs,	5TEJ,	                         FL_LDSCHED, 9e)
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md	2010-11-23 13:43:46.829584000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md	2010-11-23 13:57:43.559341000 +0800
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from arm-cores.def
 (define_attr "tune"
-	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
+	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
 	(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c gcc-4.6-svn-20101116/gcc/config/arm/arm.c
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c	2010-11-23 13:43:46.895582000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm.c	2010-11-25 16:54:54.813726000 +0800
@@ -239,6 +239,7 @@
 static rtx arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
 static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -862,6 +863,13 @@
   1
 };
 
+const struct tune_params arm_fa726te_tune =
+{
+  arm_9e_rtx_costs,
+  fa726te_sched_adjust_cost,
+  1
+};
+
 
 /* Not all of these give usefully different compilation alternatives,
    but there is no simple way of generalizing them.  */
@@ -7913,6 +7921,36 @@
   return true;
 }
 
+/* Adjust cost hook for FA726TE.  Return false after storing a new cost
+   in *COST; return true if *COST was left unchanged.  */
+static bool
+fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  /* Adjust only a true dependency of INSN on a DEP that sets the flags.  */
+  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
+      && recog_memoized (insn) >= 0
+      && recog_memoized (dep) >= 0
+      && get_attr_conds (dep) == CONDS_SET)
+    {
+      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency.  */
+      if (get_attr_conds (insn) == CONDS_USE
+          && get_attr_type (insn) != TYPE_BRANCH)
+        {
+          *cost = 3;
+          return false;
+        }
+      /* Otherwise, COND_EXEC insns and remaining flag users get cost 0.  */
+      if (GET_CODE (PATTERN (insn)) == COND_EXEC
+          || get_attr_conds (insn) == CONDS_USE)
+        {
+          *cost = 0;
+          return false;
+        }
+    }
+
+  return true;
+}
+
 /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
    It corrects the value of COST based on the relationship between
    INSN and DEP through the dependence LINK.  It returns the new
@@ -22722,6 +22760,7 @@
     case cortexa5:
     case cortexa8:
     case cortexa9:
+    case fa726te:
       return 2;
 
     default:
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md gcc-4.6-svn-20101116/gcc/config/arm/arm.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md	2010-11-23 13:43:46.939579000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/arm.md	2010-11-23 13:57:43.697327000 +0800
@@ -498,7 +498,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
+          (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -516,6 +516,11 @@
 (include "arm1020e.md")
 (include "arm1026ejs.md")
 (include "arm1136jfs.md")
+(include "fa526.md")
+(include "fa606te.md")
+(include "fa626te.md")
+(include "fmp626.md")
+(include "fa726te.md")
 (include "cortex-a5.md")
 (include "cortex-a8.md")
 (include "cortex-a9.md")
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h	2010-11-23 13:43:46.998580000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h	2010-11-25 17:09:37.545119000 +0800
@@ -52,7 +52,7 @@
 /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
 
-#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
 
 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
 
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md	2010-11-23 14:36:17.916371000 +0800
@@ -0,0 +1,161 @@
+;; Faraday FA526 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 3 (2 cycle penalty)
+;; ALU -> any use: latency = 2 (1 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA526 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa526_core" "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "526_alu_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu"))
+ "fa526_core")
+
+(define_insn_reservation "526_alu_shift_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa526_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "526_mult1" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "smlalxy,smulxy,smlaxy"))
+ "fa526_core")
+
+(define_insn_reservation "526_mult2" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
+                       umlals,smulls,smlals,smlawx"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "526_load1_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load1,load_byte"))
+ "fa526_core")
+
+(define_insn_reservation "526_load2_op" 4
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_load3_op" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_load4_op" 6
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load4"))
+ "fa526_core*4")
+
+(define_insn_reservation "526_store1_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store1"))
+ "fa526_core")
+
+(define_insn_reservation "526_store2_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_store3_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_store4_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store4"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "526_branch_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "branch"))
+ "fa526_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. For most cases, the return value is set 
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "526_call_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "call"))
+ "fa526_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md	2010-11-25 17:04:18.693334000 +0800
@@ -0,0 +1,172 @@
+;; Faraday FA606TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 2 (1 cycle penalty)
+;; ALU -> any use: latency = 1 (0 cycle penalty)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA606TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      E      M      W
+
+(define_cpu_unit "fa606te_core" "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "606te_alu_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "alu,alu_shift,alu_shift_reg"))
+ "fa606te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "606te_mult1" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlalxy"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_mult2" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_mult3" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "mul,mla,muls,mlas"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_mult4" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals"))
+ "fa606te_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "606te_load1_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_load2_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_load3_op" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_load4_op" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load4"))
+ "fa606te_core*4")
+
+(define_insn_reservation "606te_store1_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store1"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_store2_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_store3_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_store4_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store4"))
+ "fa606te_core*4")
+
+
+;;(define_insn_reservation "606te_ldm_op" 9
+;; (and (eq_attr "tune" "fa606te")
+;;      (eq_attr "type" "load2,load3,load4,store2,store3,store4"))
+;; "fa606te_core*7")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "606te_branch_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "branch"))
+ "fa606te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. For most cases, the return value is set by a
+;; mov instruction, which has 1 cycle latency.
+(define_insn_reservation "606te_call_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "call"))
+ "fa606te_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md	2010-11-23 14:35:59.928102000 +0800
@@ -0,0 +1,166 @@
+;; Faraday FA626TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Modeled pipeline characteristics:
+;; ALU -> simple address LDR/STR: latency=2 (available after 2 cycles)
+;; ALU -> shifted address LDR/STR: latency=3
+;;		( extra 1 cycle unavoidable stall)
+;; ALU -> other use: latency=2 (available after 2 cycles)
+;; LD  -> simple address LDR/STR: latency=3 (available after 3 cycles)
+;; LD  -> shifted address LDR/STR: latency=4 
+;;		( extra 1 cycle unavoidable stall)
+;; LD  -> any other use: latency = 3 (available after 3 cycles)
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA626TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa626te_core" "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "626te_alu_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_alu_shift_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa626te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "626te_mult1" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult2" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "mul,mla"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult3" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_mult4" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "626te_load1_op" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_load2_op" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load2,load3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_load3_op" 5
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load4"))
+ "fa626te_core*3")
+
+(define_insn_reservation "626te_store1_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store1"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_store2_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store2,store3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_store3_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store4"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "626te_branch_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "branch"))
+ "fa626te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. 
+(define_insn_reservation "626te_call_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "call"))
+ "fa626te_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	2010-11-25 17:06:01.877554000 +0800
@@ -0,0 +1,221 @@
+;; Faraday FA726TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA726TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa726te")
+(automata_option "ndfa")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+;;	E1	E2	E3	E4	E5	WB
+;;______________________________________________________
+;;
+;;      <-------------- LD/ST ----------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
+;;______________________________________________________
+;;
+;;      <---------- MUL --------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
+
+
+(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
+(define_cpu_unit "fa726te_mac_pipe" "fa726te")
+(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
+;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
+;; improve code quality
+(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
+(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
+
+(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
+;; reservation which blocks IS
+(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; Move instructions.
+(define_insn_reservation "726te_shift_op" 1
+  (and (eq_attr "tune" "fa726te")
+       (eq_attr "insn" "mov,mvn"))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with no shifted operand will finish in 1 cycle
+;; Other ALU instructions 2 cycles
+(define_insn_reservation "726te_alu_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the execute stage.
+;; If shift+LU, it takes 2 cycles. If shift+AU, it takes 3 cycles.
+(define_insn_reservation "726te_alu_shift_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+(define_insn_reservation "726te_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift_reg")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times. Multiply operations occur in both the execute and memory
+;; stages of the pipeline
+
+(define_insn_reservation "726te_mult_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
+                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
+ "fa726te_issue+fa726te_mac_pipe")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+
+;; Scalar loads are pipelined in FA726TE LSU pipe.
+;; Here we model the resource conflict between Load@E3-stage & Store@W-stage
+;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
+;; same "bundle", the 2nd load will introduce another ISSUE stall but is still
+;; ok to execute (and may be beneficial sometimes)
+
+(define_insn_reservation "726te_load1_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load1,load_byte"))
+ "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
+  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
+
+(define_insn_reservation "726te_store1_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store1"))
+ "fa726te_blockage*2")
+
+;; Load/Store Multiple blocks all pipelines in EX stages until WB 
+;; No other instructions can be issued together.
+;; Since they essentially prevent all scheduling opportunities, we model them
+;; together here.
+
+;; If an LDM is broken into multiple load instructions, the later instruction
+;; in pipe 1 is stalled
+(define_insn_reservation "726te_ldm2_op" 4
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load2,load3"))
+ "fa726te_blockage*4")
+
+(define_insn_reservation "726te_ldm3_op" 5
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load4"))
+ "fa726te_blockage*5")
+
+(define_insn_reservation "726te_stm2_op" 2
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store2,store3"))
+ "fa726te_blockage*3")
+
+(define_insn_reservation "726te_stm3_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store4"))
+ "fa726te_blockage*4")
+
+(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
+                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
+                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
+                 "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
+
+(define_bypass 4 "726te_load1_op" "726te_mult_op")
+(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
+(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "726te_branch_op" 0
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "branch"))
+ "fa726te_blockage")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 is ready for int return value. 
+(define_insn_reservation "726te_call_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "call"))
+ "fa726te_blockage")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md	2010-11-25 17:04:42.990054000 +0800
@@ -0,0 +1,183 @@
+;; Faraday FMP626 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+
+;; Pipeline architecture
+;;	S	E	M	W(Q1)	Q2
+;;   ___________________________________________
+;;    shifter alu    
+;;    mul1    mul2    mul3
+;;    ld/st1  ld/st2  ld/st3  ld/st4  ld/st5
+
+;; This automaton provides a pipeline description for the Faraday
+;; FMP626 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages. We only need to model the execute, memory and write
+;;   stages.
+
+(define_cpu_unit "fmp626_core" "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "mp626_alu_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_alu_shift_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fmp626_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "mp626_mult1" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult2" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "mul,mla"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult3" 3
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_mult4" 4
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fmp626_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without a base register
+;; writeback (such as "ldm!").  These models assume that all memory
+;; references hit in dcache.
+
+(define_insn_reservation "mp626_load1_op" 5
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load1,load_byte"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_load2_op" 6
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load2,load3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_load3_op" 7
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load4"))
+ "fmp626_core*3")
+
+(define_insn_reservation "mp626_store1_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store1"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_store2_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store2,store3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_store3_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store4"))
+ "fmp626_core*3")
+
+(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op"
+                 "mp626_store1_op,mp626_store2_op,mp626_store3_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\
+                  mp626_mult3,mp626_mult4" "mp626_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op")
+(define_bypass 2 "mp626_mult3" "mp626_alu_op")
+(define_bypass 3 "mp626_mult4" "mp626_alu_op")
+(define_bypass 4 "mp626_load1_op" "mp626_alu_op")
+(define_bypass 5 "mp626_load2_op" "mp626_alu_op")
+(define_bypass 6 "mp626_load3_op" "mp626_alu_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "mp626_branch_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "branch"))
+ "fmp626_core")
+
+;; The latency for a call is actually the latency until the result is
+;; available, i.e., until R0 is ready with the int return value.
+(define_insn_reservation "mp626_call_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "call"))
+ "fmp626_core")
+
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm gcc-4.6-svn-20101116/gcc/config/arm/t-arm
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm	2010-11-23 13:43:47.213582000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm	2010-11-23 13:57:43.738329000 +0800
@@ -24,6 +24,11 @@
 		$(srcdir)/config/arm/arm1020e.md \
 		$(srcdir)/config/arm/arm1026ejs.md \
 		$(srcdir)/config/arm/arm1136jfs.md \
+		$(srcdir)/config/arm/fa526.md \
+		$(srcdir)/config/arm/fa606te.md \
+		$(srcdir)/config/arm/fa626te.md \
+		$(srcdir)/config/arm/fmp626.md \
+		$(srcdir)/config/arm/fa726te.md \
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf	2010-11-23 13:43:47.221580000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf	2010-11-25 16:07:40.603668000 +0800
@@ -36,6 +36,10 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+#MULTILIB_OPTIONS     += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
+
 #MULTILIB_OPTIONS      += march=armv7
 #MULTILIB_DIRNAMES     += thumb2
 #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
@@ -52,6 +56,8 @@
 MULTILIB_OPTIONS       += mfloat-abi=hard
 MULTILIB_DIRNAMES      += fpu
 MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
 
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi
--- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi	2010-11-23 13:43:47.231588000 +0800
+++ gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi	2010-11-25 16:09:09.992215000 +0800
@@ -24,6 +24,10 @@
 MULTILIB_OPTIONS	=
 MULTILIB_DIRNAMES	=
 
+#MULTILIB_OPTIONS     += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te*
+
 # Use a version of div0 which raises SIGFPE, and a special __clear_cache.
 LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
 
diff -uNr tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi gcc-4.6-svn-20101116/gcc/doc/invoke.texi
--- tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi	2010-11-23 13:44:15.846799000 +0800
+++ gcc-4.6-svn-20101116/gcc/doc/invoke.texi	2010-11-23 14:33:07.490613000 +0800
@@ -10115,7 +10115,9 @@
 @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3},
 @samp{cortex-m1},
 @samp{cortex-m0},
-@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312},
+@samp{fa526}, @samp{fa626},
+@samp{fa606te}, @samp{fa626te}, @samp{fmp626}, @samp{fa726te}.
 
 @item -mtune=@var{name}
 @opindex mtune

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-25 11:31       ` M.F. Wu
@ 2010-11-30 14:51         ` Ramana Radhakrishnan
  2010-12-02  8:27           ` M.F. Wu
  0 siblings, 1 reply; 11+ messages in thread
From: Ramana Radhakrishnan @ 2010-11-30 14:51 UTC (permalink / raw)
  To: M.F. Wu; +Cc: Joseph S. Myers, gcc-patches, toolchain

Hi Mingfeng,

Thanks for making these changes. 

Please do not make the Changelog a part of the final patch. Please make 
this a part of your final mail submission and not a part of your patch.

> @@ -7913,6 +7921,36 @@

<...>

> +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */

Full stop at the end of comment followed by 2 spaces before end of comment.
Can you please audit your patch to check for these issues ? 

This is true for comments in the machine description parts of your patch as well.

> +      if (get_attr_conds(insn)  == CONDS_USE &&
> +          get_attr_type(insn) != TYPE_BRANCH)
> +        {
> +          *cost = 3;
> +          return false;
> +        }

Space between function name and parenthesis. Thus it should be 
get_attr_conds (insn) and not get_attr_conds(insn) as above. 

> +
> +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
> +          || get_attr_conds(insn)  == CONDS_USE)
> +        {
> +          *cost = 0;
> +          return false;
> +        }
> +    }

Likewise.

> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
> >> 08:00:00.000000000 +0800
> >> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
> >> 14:36:17.916371000 +0800 
> 
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> > +;; Branch and Call Instructions
> >> >
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> > +

> > And equivalently in all the other pipeline descriptions.
> >
> 
> Sorry, I don't understand exactly what you mean...

> +;; Branch instructions are difficult to model accurately.  The ARM
> +;; core can predict most branches.  If the branch is predicted
> 
In the sentence above - 
Replace ARM with FA526 and similarly in the other pipeline descriptions. 

The FA526 core isn't one made by ARM and am not sure if you can use the name
in that regard here :) 

> > ;
> >  
> > +const struct tune_params arm_fa726te_tune =
> > +{
> > +  arm_9e_rtx_costs,
> > +  fa726te_sched_adjust_cost,
> > +  1
> > +};
> > +

This part of your patch is now out-of-date thanks to Ian Bolton's latest commits in that area with respect
to preloads. You might want to consider that in your final submission. I suppose using the defaults
and turning off preloads at O3 would be the correct thing to do to get your patch shepherded through.

> 
> --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
> +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md	2010-11-25 17:06:01.877554000 +0800
> @@ -0,0 +1,221 @@
> 
> +(define_automaton "fa726te")
> +(automata_option "ndfa")
> +

Why do you have an ndfa option here? Does this give you benefit with benchmarking on the FA726te core since this usually increases compile time
as the automaton ends up searching for all possible options ? 

> +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
> +;; improve code quality

Full stop at the end of the comment. "Pretend" should start with a capital P, not lower case.
There should be 2 spaces between a full stop (or other punctuation character that terminates a sentence)
and the start of the next sentence. There are a number of places in your patch where one can see such cases.

> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")

You have a query_cpu_unit which you don't seem to be querying for in the backend in any form? Is there any thing else
missing in your pipeline description or has this been put in for future use ? 

> +;; reservation which blocks IS
> +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")

Can you clarify the comment above ? Again the comments about sentence case and full stops hold.

cheers
Ramana

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-11-30 14:51         ` Ramana Radhakrishnan
@ 2010-12-02  8:27           ` M.F. Wu
  2010-12-09 14:49             ` Ramana Radhakrishnan
  0 siblings, 1 reply; 11+ messages in thread
From: M.F. Wu @ 2010-12-02  8:27 UTC (permalink / raw)
  To: ramana.radhakrishnan
  Cc: Joseph S. Myers, gcc-patches, toolchain, Richard.Earnshaw,
	Matthew.Gretton-Dann, Leon.Chen

[-- Attachment #1: Type: text/plain, Size: 6258 bytes --]

Dear Ramana,

Thank you for your comments about the patch.
The patch has been modified as the attached
file shows.

The ChangeLog:

2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
	  Mingfeng Wu  <mingfeng@faraday-tech.com>

	* config/arm/arm-cores.def: Add Faraday CPU support -
	fa526/fa626/fa606te/fa626te/fmp626/fa726te.
	* config/arm/arm-tune.md: Regenerate.
	* config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te.
	(fa726te_sched_adjust_cost): New cost function for fa726te.
	(arm_issue_rate): Add fa726te.
	* config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
	and include machine description files.
	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
	* config/arm/t-arm (MD_INCLUDES): Include machine description files for
	Faraday cores.
	* config/arm/t-arm-elf: Add multilib option for Faraday cores.
	* config/arm/t-linux-eabi: Add multilib option for Faraday cores except
	fa526 and fa626.
	* doc/invoke.texi: Document -mcpu for Faraday cores.
	* config/arm/fa526.md: New file.
	* config/arm/fa626.md: New file.
	* config/arm/fa606te.md: New file.
	* config/arm/fa626te.md: New file.
	* config/arm/fmp626.md: New file.
	* config/arm/fa726te.md: New file.



2010/11/30 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> Hi Mingfeng,
>
> Thanks for making these changes.
>
> Please do not make the Changelog a part of the final patch. Please make
> this a part of your final mail submission and not a part of your patch.
>
>
>> @@ -7913,6 +7921,36 @@
>
> <...>
>
>> +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */
>
> Full stop at the end of comment followed by 2 spaces before end of comment.
> Can you please audit your patch to check for these issues ?
>
> This is true for comments in the machine description parts of your patch as well.
>

Fixed.

>
>> +      if (get_attr_conds(insn)  == CONDS_USE &&
>> +          get_attr_type(insn) != TYPE_BRANCH)
>> +        {
>> +          *cost = 3;
>> +          return false;
>> +        }
>
> Space between function name and parenthesis. Thus it should be
> get_attr_conds (insn) and not get_attr_conds(insn) as above.
>
>> +
>> +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
>> +          || get_attr_conds(insn)  == CONDS_USE)
>> +        {
>> +          *cost = 0;
>> +          return false;
>> +        }
>> +    }
>
> Likewise.
>

Fixed.

>
>> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> >> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> >> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
>> >> 08:00:00.000000000 +0800
>> >> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
>> >> 14:36:17.916371000 +0800
>>
>> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> >> > +;; Branch and Call Instructions
>> >> >
>> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> >> > +
>
>> > And equivalently in all the other pipeline descriptions.
>> >
>>
>> Sorry, I don't understand exactly what you mean...
>
>
>> +;; Branch instructions are difficult to model accurately.  The ARM
>> +;; core can predict most branches.  If the branch is predicted
>>
> In the sentence above -
> Replace ARM with FA526 and similarly in the other pipeline descriptions.
>
> The FA526 core isn't one made by ARM and am not sure if you can use the name
> in that regard here :)
>

OK, you are right. I have replaced "ARM" with the name of our core.

>> > ;
>> >
>> > +const struct tune_params arm_fa726te_tune =
>> > +{
>> > +  arm_9e_rtx_costs,
>> > +  fa726te_sched_adjust_cost,
>> > +  1
>> > +};
>> > +
>
> This part of your patch is now out-of-date thanks to Ian Bolton's latest commits in that area with respect
> to preloads. You might want to consider that in your final submission. I suppose using the defaults
> and turning off preloads at O3 would be the correct thing to do to get your patch shepherded through.
>

OK. I modified the arm_fa726te_tune.

const struct tune_params arm_fa726te_tune =
{
  arm_9e_rtx_costs,
  fa726te_sched_adjust_cost,
  1,
  ARM_PREFETCH_NOT_BENEFICIAL
};


>>
>> --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md        1970-01-01 08:00:00.000000000 +0800
>> +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md    2010-11-25 17:06:01.877554000 +0800
>> @@ -0,0 +1,221 @@
>>
>> +(define_automaton "fa726te")
>> +(automata_option "ndfa")
>> +
>
> Why do you have an ndfa option here? Does this give you benefit with benchmarking on the FA726te core since this usually increases compile time
> as the automaton ends up searching for all possible options ?
>

Yes, the ndfa option does benefit our benchmarks, but only a little, so I
have removed the ndfa option here.

>
>> +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
>> +;; improve code quality
>
> Full stop at the end of the comment. "Pretend" should start with a capital P, not lower case.
> There should be 2 spaces between a full stop (or other punctuation character that terminates a sentence)
> and the start of the next sentence. There are a number of places in your patch where one can see such cases.
>

Fixed.

>
>> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
>
> You have a query_cpu_unit which you don't seem to be querying for in the backend in any form? Is there any thing else
> missing in your pipeline description or has this been put in for future use ?
>

The units defined by define_query_cpu_unit are used only within fa726te.md.
The two units, fa726te_lsu1_pipe_e and fa726te_lsu1_pipe_w, are used only
to arrange the load instructions.  Because fa726te has only one ldr/str
pipe, I use define_query_cpu_unit to define a second, pseudo pipe for
better load instruction scheduling.

>> +;; reservation which blocks IS
>> +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
>
> Can you clarify the comment above ? Again the comments about sentence case and full stops hold.
>

It is used to restrict instruction issue to one instruction per cycle.

>
>
> cheers
> Ramana
>
>
>
>
>
>

[-- Attachment #2: gcc_4.6_svn_167325_faraday_cpu_support.patch --]
[-- Type: application/octet-stream, Size: 46669 bytes --]

diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def	2010-12-02 09:18:17.512519000 +0800
@@ -74,6 +74,8 @@
 ARM_CORE("strongarm110",  strongarm110,	4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1100", strongarm1100, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1110", strongarm1110, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE("fa526",         fa526,        4,                               FL_LDSCHED, fastmul)
+ARM_CORE("fa626",         fa626,        4,                               FL_LDSCHED, fastmul)
 
 /* V4T Architecture Processors */
 ARM_CORE("arm7tdmi",      arm7tdmi,	4T,	FL_CO_PROC          , fastmul)
@@ -104,6 +106,10 @@
 ARM_CORE("xscale",        xscale,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale)
 ARM_CORE("iwmmxt",        iwmmxt,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
 ARM_CORE("iwmmxt2",       iwmmxt2,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
+ARM_CORE("fa606te",       fa606te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa626te",       fa626te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fmp626",        fmp626,       5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa726te",       fa726te,      5TE,                             FL_LDSCHED, fa726te)
 
 /* V5TEJ Architecture Processors */
 ARM_CORE("arm926ej-s",    arm926ejs,	5TEJ,	                         FL_LDSCHED, 9e)
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md	2010-12-02 09:18:17.515528000 +0800
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from arm-cores.def
 (define_attr "tune"
-	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
+	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
 	(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c	2010-12-02 09:20:29.903984000 +0800
@@ -241,6 +241,7 @@
 static rtx arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
 static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -882,6 +883,14 @@
   ARM_PREFETCH_BENEFICIAL(4,32,32)
 };
 
+const struct tune_params arm_fa726te_tune =
+{
+  arm_9e_rtx_costs,
+  fa726te_sched_adjust_cost,
+  1,
+  ARM_PREFETCH_NOT_BENEFICIAL
+};
+
 
 /* Not all of these give usefully different compilation alternatives,
    but there is no simple way of generalizing them.  */
@@ -7968,6 +7977,36 @@
   return true;
 }
 
+/* Adjust cost hook for FA726TE.  */
+static bool
+fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  /* For FA726TE, a true dependency on CPSR (i.e. set cond followed by
+     predicated) has a penalty of 3.  */
+  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
+      && recog_memoized (insn) >= 0
+      && recog_memoized (dep) >= 0
+      && get_attr_conds (dep) == CONDS_SET)
+    {
+      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency.  */
+      if (get_attr_conds (insn) == CONDS_USE
+          && get_attr_type (insn) != TYPE_BRANCH)
+        {
+          *cost = 3;
+          return false;
+        }
+
+      if (GET_CODE (PATTERN (insn)) == COND_EXEC
+          || get_attr_conds (insn) == CONDS_USE)
+        {
+          *cost = 0;
+          return false;
+        }
+    }
+
+  return true;
+}
+
 /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
    It corrects the value of COST based on the relationship between
    INSN and DEP through the dependence LINK.  It returns the new
@@ -22779,6 +22818,7 @@
     case cortexa5:
     case cortexa8:
     case cortexa9:
+    case fa726te:
       return 2;
 
     default:
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md	2010-12-02 09:18:17.656513000 +0800
@@ -498,7 +498,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
+          (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -516,6 +516,11 @@
 (include "arm1020e.md")
 (include "arm1026ejs.md")
 (include "arm1136jfs.md")
+(include "fa526.md")
+(include "fa606te.md")
+(include "fa626te.md")
+(include "fmp626.md")
+(include "fa726te.md")
 (include "cortex-a5.md")
 (include "cortex-a8.md")
 (include "cortex-a9.md")
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-02 09:18:17.660518000 +0800
@@ -52,7 +52,7 @@
 /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
 
-#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
 
 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
 
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md	2010-12-02 09:18:17.666513000 +0800
@@ -0,0 +1,161 @@
+;; Faraday FA526 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 3 (2 cycle penalty).
+;; ALU -> any use: latency = 2 (1 cycle penalty).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA526 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa526_core" "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "526_alu_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu"))
+ "fa526_core")
+
+(define_insn_reservation "526_alu_shift_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa526_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "526_mult1" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "smlalxy,smulxy,smlaxy"))
+ "fa526_core")
+
+(define_insn_reservation "526_mult2" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
+                       umlals,smulls,smlals,smlawx"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "526_load1_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load1,load_byte"))
+ "fa526_core")
+
+(define_insn_reservation "526_load2_op" 4
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_load3_op" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_load4_op" 6
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load4"))
+ "fa526_core*4")
+
+(define_insn_reservation "526_store1_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store1"))
+ "fa526_core")
+
+(define_insn_reservation "526_store2_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_store3_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_store4_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store4"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA526
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "526_branch_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "branch"))
+ "fa526_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.  For most cases, the return value is set
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "526_call_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "call"))
+ "fa526_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	2010-12-02 09:18:17.671515000 +0800
@@ -0,0 +1,171 @@
+;; Faraday FA606TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 2 (1 cycle penalty).
+;; ALU -> any use: latency = 1 (0 cycle penalty).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA606TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      E      M      W
+
+(define_cpu_unit "fa606te_core" "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "606te_alu_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "alu,alu_shift,alu_shift_reg"))
+ "fa606te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "606te_mult1" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlalxy"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_mult2" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_mult3" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "mul,mla,muls,mlas"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_mult4" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals"))
+ "fa606te_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "606te_load1_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_load2_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_load3_op" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_load4_op" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load4"))
+ "fa606te_core*4")
+
+(define_insn_reservation "606te_store1_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store1"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_store2_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_store3_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_store4_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store4"))
+ "fa606te_core*4")
+
+
+;;(define_insn_reservation "606te_ldm_op" 9
+;; (and (eq_attr "tune" "fa606te")
+;;      (eq_attr "type" "load2,load3,load4,store2,store3,store4"))
+;; "fa606te_core*7")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA606TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "606te_branch_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "branch"))
+ "fa606te_core")
+
+;; The latency for a call is actually the latency until the result is available,
+;; i.e. R0 is ready for an int return value.  For most cases, the return value is
+;; set by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "606te_call_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "call"))
+ "fa606te_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md	2010-12-02 09:18:17.676517000 +0800
@@ -0,0 +1,165 @@
+;; Faraday FA626TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Modeled pipeline characteristics:
+;; ALU -> simple address LDR/STR: latency = 2 (available after 2 cycles).
+;; ALU -> shifted address LDR/STR: latency = 3.
+;;		( extra 1 cycle unavoidable stall).
+;; ALU -> other use: latency = 2 (available after 2 cycles).
+;; LD  -> simple address LDR/STR: latency = 3 (available after 3 cycles).
+;; LD  -> shifted address LDR/STR: latency = 4
+;;		( extra 1 cycle unavoidable stall).
+;; LD  -> any other use: latency = 3 (available after 3 cycles).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA626TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa626te_core" "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "626te_alu_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_alu_shift_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa626te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "626te_mult1" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult2" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "mul,mla"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult3" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_mult4" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "626te_load1_op" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_load2_op" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load2,load3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_load3_op" 5
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load4"))
+ "fa626te_core*3")
+
+(define_insn_reservation "626te_store1_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store1"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_store2_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store2,store3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_store3_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store4"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA626TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "626te_branch_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "branch"))
+ "fa626te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value. 
+(define_insn_reservation "626te_call_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "call"))
+ "fa626te_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	2010-12-02 14:45:23.731365000 +0800
@@ -0,0 +1,217 @@
+;; Faraday FA726TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA726TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa726te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;	E1	E2	E3	E4	E5	WB
+;;______________________________________________________
+;;
+;;      <-------------- LD/ST ----------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
+;;______________________________________________________
+;;
+;;      <---------- MUL --------->
+;;    shifter + LU      <-- AU --> 
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
+
+
+(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
+(define_cpu_unit "fa726te_mac_pipe" "fa726te")
+(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
+
+;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
+;; improve code quality.
+(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
+(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
+
+(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
+(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; Move instructions.
+(define_insn_reservation "726te_shift_op" 1
+  (and (eq_attr "tune" "fa726te")
+       (eq_attr "insn" "mov,mvn"))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with no shifted operand finish in 1 cycle.
+;; Other ALU instructions take 2 cycles.
+(define_insn_reservation "726te_alu_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with a shift-by-register operand.
+;; These really stall in the decoder, in order to read the shift value
+;; in the first cycle.  If the instruction uses both shifter and AU,
+;; it takes 3 cycles.
+(define_insn_reservation "726te_alu_shift_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+(define_insn_reservation "726te_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift_reg")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times.  Multiply operations occur in both the execute and memory
+;; stages of the pipeline
+
+(define_insn_reservation "726te_mult_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
+                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
+ "fa726te_issue+fa726te_mac_pipe")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+
+;; Scalar loads are pipelined in FA726TE LSU pipe.
+;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
+;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
+;; same "bundle", the 2nd load will introduce another ISSUE stall but is
+;; still ok to execute (and may be beneficial sometimes).
+
+(define_insn_reservation "726te_load1_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load1,load_byte"))
+ "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
+  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
+
+(define_insn_reservation "726te_store1_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store1"))
+ "fa726te_blockage*2")
+
+;; Load/Store Multiple blocks all pipelines in EX stages until WB.
+;; No other instructions can be issued together.  Since they essentially
+;; prevent all scheduling opportunities, we model them together here.
+
+;; An LDM is broken into multiple load instructions; a later instruction in
+;; pipe 1 is stalled.
+(define_insn_reservation "726te_ldm2_op" 4
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load2,load3"))
+ "fa726te_blockage*4")
+
+(define_insn_reservation "726te_ldm3_op" 5
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load4"))
+ "fa726te_blockage*5")
+
+(define_insn_reservation "726te_stm2_op" 2
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store2,store3"))
+ "fa726te_blockage*3")
+
+(define_insn_reservation "726te_stm3_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store4"))
+ "fa726te_blockage*4")
+
+(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
+                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
+                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
+                 "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
+                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
+
+(define_bypass 4 "726te_load1_op" "726te_mult_op")
+(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
+(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA726TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "726te_branch_op" 0
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "branch"))
+ "fa726te_blockage")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 is ready for int return value. 
+(define_insn_reservation "726te_call_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "call"))
+ "fa726te_blockage")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	2010-12-02 09:18:17.687514000 +0800
@@ -0,0 +1,182 @@
+;; Faraday FMP626 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Pipeline architecture
+;;	S	E	M	W(Q1)	Q2
+;;   ___________________________________________
+;;    shifter alu    
+;;    mul1    mul2    mul3
+;;    ld/st1  ld/st2  ld/st3  ld/st4  ld/st5
+
+;; This automaton provides a pipeline description for the Faraday
+;; FMP626 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+(define_cpu_unit "fmp626_core" "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "mp626_alu_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_alu_shift_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fmp626_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "mp626_mult1" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult2" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "mul,mla"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult3" 3
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_mult4" 4
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fmp626_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "mp626_load1_op" 5
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load1,load_byte"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_load2_op" 6
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load2,load3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_load3_op" 7
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load4"))
+ "fmp626_core*3")
+
+(define_insn_reservation "mp626_store1_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store1"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_store2_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store2,store3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_store3_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store4"))
+ "fmp626_core*3")
+
+(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op"
+                 "mp626_store1_op,mp626_store2_op,mp626_store3_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\
+                  mp626_mult3,mp626_mult4" "mp626_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op")
+(define_bypass 2 "mp626_mult3" "mp626_alu_op")
+(define_bypass 3 "mp626_mult4" "mp626_alu_op")
+(define_bypass 4 "mp626_load1_op" "mp626_alu_op")
+(define_bypass 5 "mp626_load2_op" "mp626_alu_op")
+(define_bypass 6 "mp626_load3_op" "mp626_alu_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FMP626
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "mp626_branch_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "branch"))
+ "fmp626_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.
+(define_insn_reservation "mp626_call_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "call"))
+ "fmp626_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm	2010-12-02 09:18:17.689525000 +0800
@@ -24,6 +24,11 @@
 		$(srcdir)/config/arm/arm1020e.md \
 		$(srcdir)/config/arm/arm1026ejs.md \
 		$(srcdir)/config/arm/arm1136jfs.md \
+		$(srcdir)/config/arm/fa526.md \
+		$(srcdir)/config/arm/fa606te.md \
+		$(srcdir)/config/arm/fa626te.md \
+		$(srcdir)/config/arm/fmp626.md \
+		$(srcdir)/config/arm/fa726te.md \
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf	2010-12-02 15:43:26.302446000 +0800
@@ -36,6 +36,10 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+#MULTILIB_OPTIONS     += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
+
 #MULTILIB_OPTIONS      += march=armv7
 #MULTILIB_DIRNAMES     += thumb2
 #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
@@ -52,6 +56,8 @@
 MULTILIB_OPTIONS       += mfloat-abi=hard
 MULTILIB_DIRNAMES      += fpu
 MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
 
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi	2010-12-02 09:18:17.697515000 +0800
@@ -24,6 +24,10 @@
 MULTILIB_OPTIONS	=
 MULTILIB_DIRNAMES	=
 
+#MULTILIB_OPTIONS     += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te*
+
 # Use a version of div0 which raises SIGFPE, and a special __clear_cache.
 LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
 
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi
--- tmp/gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi	2010-12-01 18:34:13.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi	2010-12-02 09:18:17.785515000 +0800
@@ -10110,7 +10110,9 @@
 @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3},
 @samp{cortex-m1},
 @samp{cortex-m0},
-@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312},
+@samp{fa526}, @samp{fa626},
+@samp{fa606te}, @samp{fa626te}, @samp{fmp626}, @samp{fa726te}.
 
 @item -mtune=@var{name}
 @opindex mtune

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-12-02  8:27           ` M.F. Wu
@ 2010-12-09 14:49             ` Ramana Radhakrishnan
  2010-12-13 11:01               ` M.F. Wu
  0 siblings, 1 reply; 11+ messages in thread
From: Ramana Radhakrishnan @ 2010-12-09 14:49 UTC (permalink / raw)
  To: M.F. Wu
  Cc: Joseph S. Myers, gcc-patches, toolchain, Richard.Earnshaw,
	Matthew.Gretton-Dann, Leon.Chen

Hi Mingfeng

Sorry about the late response and thanks for working through the issues. 
I've been off sick and only got back to looking at this today.

Some minor nits in your changelog.


> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
> 	  Mingfeng Wu  <mingfeng@faraday-tech.com>
> 
> 	* config/arm/arm-cores.def: Add Faraday CPU support -
> 	fa526/fa626/fa606te/fa626te/fmp626/fa726te.

',' instead of '/' in the changelog entry. fa526, fa626 etc.

> 	* config/arm/arm-tune.md: Regenerate.
> 	* config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te

It's enough to say New.
	
> 	(fa726te_sched_adjust_cost): New cost function for fa726te.

Enough to say New.

> 	(arm_issue_rate): Add fa726te.

s/Add/Handle

> 	* config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
> 	and include machine description files.

Replace sentence with :
Don't use Generic scheduler for Faraday cores. 


> 	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.

s/Add/Handle


> >
> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w"
> "fa726te")
> >
> > You have a query_cpu_unit which you don't seem to be querying for in
> the backend in any form? Is there any thing else
> > missing in your pipeline description or has this been put in for
> future use ?
> >
> 
> I only use the units defined by query_cpu_unit in the fa726te.md. The
> two units, fa726te_lsu1_pipe_e and
> fa726te_lsu1_pipe_w, are only used for arranging the load instructions.
> Because fa726te only supports one
> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
> for better load instruction scheduling.

The reason I asked why you used the define_cpu_unit vs
define_query_cpu_unit was because there was no backend hook that queried
for the cpu unit in question. IIRC define_cpu_unit and
define_query_cpu_unit are more or less identical except for the fact
that one can result in better minimization than the other.

However in this case the difference in the size of the automaton should
be minimal considering it isn't a very big automaton in question. 

> 
> >> +;; reservation which blocks IS
> >> +(define_reservation "fa726te_blockage" "(fa726te_is0
> +fa726te_is1)")
> >
> > Can you clarify the comment above ? Again the comments about
> sentence case and full stops hold.

> 
> 
> It is used to restrict the instruction issue to one.


Ok so make that explicit in the comment. Something like: 

" ;Reservation to restrict issue to 1.



Now on to your latest patch submission.



Still a few formatting issues from your latest patch.

 
> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	2010-12-02 09:18:17.671515000 +0800
> 
<...>

> 
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;; Branch and Call Instructions
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +;; Branch instructions are difficult to model accurately.  The FA606TE
> +;; core can predict most branches.  If the branch is predicted
> +;; correctly, and predicted early enough, the branch can be completely
> +;; eliminated from the instruction stream.  Some branches can
> +;; therefore appear to require zero cycles to execute.  We assume that
> +;; all branches are predicted correctly, and that the latency is
> +;; therefore the minimum value.
> +
> +(define_insn_reservation "606te_branch_op" 0
> + (and (eq_attr "tune" "fa606te")
> +      (eq_attr "type" "branch"))
> + "fa606te_core")
> +
> +;; The latency for a call is actually the latency when the result being available.
> +;; i.e. R0 ready for int return value. For most cases, the return value is set by a
					^^^
					2 spaces between '.' and start of next sentence.

> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	2010-12-02 09:18:17.671515000 +0800
> @@ -0,0 +1,171 @@
> 
> +;; The latency for a call is actually the latency when the result is available.
> +;; i.e. R0 ready for int return value. 
					 ^	

Very small nit. Remove trailing white-space. 

> 
> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md
> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	2010-12-02 14:45:23.731365000 +0800

<snip...>

> 
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;; Pipelines
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +;;   The ALU pipeline has fetch, decode, execute, memory, and
> +;;   write stages.  We only need to model the execute, memory and write
> +;;   stages.
> +
> +;;	E1	E2	E3	E4	E5	WB
> +;;______________________________________________________
> +;;
> +;;      <-------------- LD/ST ----------->
> +;;    shifter + LU      <-- AU --> 
				     ^ Trailing whitespace.

> +;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
> +;;______________________________________________________
> +;;
> +;;      <---------- MUL --------->
> +;;    shifter + LU      <-- AU --> 
				     ^ Trailing whitespace.


From fa726te.md 

> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
									     ^
Trailing whitespace.

> +                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" 
									     ^
Trailing whitespace.


> +;; The latency for a call is actually the latency when the result is available.
> +;; i.e. R0 is ready for int return value. 

Likewise.


> 
> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md
> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	2010-12-02 09:18:17.687514000 +0800
<snip>

> +;; Pipeline architecture
> +;;	S	E	M	W(Q1)	Q2
> +;;   ___________________________________________
> +;;    shifter alu    
		    ^^^ Multiple trailing whitespaces. 



> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h
> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-01 18:48:48.000000000 +0800
> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-02 09:18:17.660518000 +0800
> @@ -52,7 +52,7 @@
>  /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
>  #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
>  
> -#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
> +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
> 

Exceeds the 80 character per line limit.  Can you split this across multiple lines ?

Something like
#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*\
|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"

should do the trick.

However this isn't an approval since I can't approve or reject your patch.


cheers
Ramana



On Thu, 2010-12-02 at 16:27 +0800, M.F. Wu wrote: 
> Dear Ramana,
> 
> Thank you for your comments about the patch.
> The patch has been modified as the attached
> file shows.
> 
> The ChangeLog:
> 
> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
> 	  Mingfeng Wu  <mingfeng@faraday-tech.com>
> 
> 	* config/arm/arm-cores.def: Add Faraday CPU support -
> 	fa526/fa626/fa606te/fa626te/fmp626/fa726te.
> 	* config/arm/arm-tune.md: Regenerate.
> 	* config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te
> 	(fa726te_sched_adjust_cost): New cost function for fa726te.
> 	(arm_issue_rate): Add fa726te.
> 	* config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
> 	and include machine description files.
> 	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
> 	* config/arm/t-arm (MD_INCLUDES): Include machine description files for
> 	Faraday cores.
> 	* config/arm/t-arm-elf: Add multilib option for Faraday cores.
> 	* config/arm/t-linux-eabi: Add multilib option for Faraday cores except
> 	fa526 and fa626.
> 	* doc/invoke.texi: Document -mcpu for Faraday cores.
> 	* config/arm/fa526.md: New file.
> 	* config/arm/fa626.md: New file.
> 	* config/arm/fa606te.md: New file.
> 	* config/arm/fa626te.md: New file.
> 	* config/arm/fmp626.md: New file.
> 	* config/arm/fa726te.md: New file.
> 
> 
> 
> 2010/11/30 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> > Hi Mingfeng,
> >
> > Thanks for making these changes.
> >
> > Please do not make the Changelog a part of the final patch. Please make
> > this a part of your final mail submission and not a part of your patch.
> >
> >
> >> @@ -7913,6 +7921,36 @@
> >
> > <...>
> >
> >> +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */
> >
> > Full stop at the end of comment followed by 2 spaces before end of comment.
> > Can you please audit your patch to check for these issues ?
> >
> > This is true for comments in the machine description parts of your patch as well.
> >
> 
> Fixed.
> 
> >
> >> +      if (get_attr_conds(insn)  == CONDS_USE &&
> >> +          get_attr_type(insn) != TYPE_BRANCH)
> >> +        {
> >> +          *cost = 3;
> >> +          return false;
> >> +        }
> >
> > Space between function name and parenthesis. Thus it should be
> > get_attr_conds (insn) and not get_attr_conds(insn) as above.
> >
> >> +
> >> +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
> >> +          || get_attr_conds(insn)  == CONDS_USE)
> >> +        {
> >> +          *cost = 0;
> >> +          return false;
> >> +        }
> >> +    }
> >
> > Likewise.
> >
> 
> Fixed.
> 
> >
> >> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> >> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> >> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
> >> >> 08:00:00.000000000 +0800
> >> >> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
> >> >> 14:36:17.916371000 +0800
> >>
> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> >> > +;; Branch and Call Instructions
> >> >> >
> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> >> > +
> >
> >> > And equivalently in all the other pipeline descriptions.
> >> >
> >>
> >> Sorry, I don't understand exactly what you mean...
> >
> >
> >> +;; Branch instructions are difficult to model accurately.  The ARM
> >> +;; core can predict most branches.  If the branch is predicted
> >>
> > In the sentence above -
> > Replace ARM with FA526 and similarly in the other pipeline descriptions.
> >
> > The FA526 core isn't one made by ARM and am not sure if you can use the name
> > in that regard here :)
> >
> 
> OK, you are right. I have modified the "ARM" to our core.
> 
> >> > ;
> >> >
> >> > +const struct tune_params arm_fa726te_tune =
> >> > +{
> >> > +  arm_9e_rtx_costs,
> >> > +  fa726te_sched_adjust_cost,
> >> > +  1
> >> > +};
> >> > +
> >
> > This part of your patch is now out-of-date thanks to Ian Bolton's latest commits in that area with respect
> > to preloads. You might want to consider that in your final submission. I suppose using the defaults
> > and turning off preloads at O3 would be the correct thing to do to get your patch shepherded through.
> >
> 
> OK. I modified the arm_fa726te_tune.
> 
> const struct tune_params arm_fa726te_tune =
> {
>   arm_9e_rtx_costs,
>   fa726te_sched_adjust_cost,
>   1,
>   ARM_PREFETCH_NOT_BENEFICIAL
> };
> 
> 
> >>
> >> --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md        1970-01-01 08:00:00.000000000 +0800
> >> +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md    2010-11-25 17:06:01.877554000 +0800
> >> @@ -0,0 +1,221 @@
> >>
> >> +(define_automaton "fa726te")
> >> +(automata_option "ndfa")
> >> +
> >
> > Why do you have an ndfa option here? Does this give you benefit with benchmarking on the FA726te core since this usually increases compile time
> > as the automaton ends up searching for all possible options ?
> >
> 
> Yes, the ndfa option does benefit our benchmarking, but only a little. So I
> have removed the ndfa option here.
> 
> >
> >> +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
> >> +;; improve code quality
> >
> > Full stop at the end of the comment. pretend should start with a capital P and not lower case. (Pretend)
> > 2 spaces between a full stop or a punctuation character that terminates a sentence and the start of
> > the next sentence. There are a number of places in your patch where one can see such cases.
> >
> 
> Fixed.
> 
> >
> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
> >
> > You have a query_cpu_unit which you don't seem to be querying for in the backend in any form? Is there any thing else
> > missing in your pipeline description or has this been put in for future use ?
> >
> 
> I only use the units defined by query_cpu_unit in the fa726te.md. The
> two units, fa726te_lsu1_pipe_e and
> fa726te_lsu1_pipe_w, are only used for arrange the load instructions.
> Because fa726te only supports one
> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
> for better load instruction scheduling.
> 
> >> +;; reservation which blocks IS
> >> +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
> >
> > Can you clarify the comment above ? Again the comments about sentence case and full stops hold.
> >
> 
> It is used to restrict the instruction issue to one.
> 
> >
> >
> > cheers
> > Ramana
> >
> >
> >
> >
> >
> >


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-12-09 14:49             ` Ramana Radhakrishnan
@ 2010-12-13 11:01               ` M.F. Wu
  2010-12-20 18:48                 ` Richard Earnshaw
  0 siblings, 1 reply; 11+ messages in thread
From: M.F. Wu @ 2010-12-13 11:01 UTC (permalink / raw)
  To: ramana.radhakrishnan
  Cc: Joseph S. Myers, gcc-patches, toolchain, Richard.Earnshaw,
	Matthew.Gretton-Dann, Leon.Chen

[-- Attachment #1: Type: text/plain, Size: 16873 bytes --]

Dear Ramana,

Thank you for your kind help. I have modified the patch according to
your suggestions. The following is my ChangeLog.

2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
	    Mingfeng Wu  <mingfeng@faraday-tech.com>

	* config/arm/arm-cores.def: Add Faraday CPU support -
	fa526,fa626,fa606te,fa626te,fmp626,fa726te.
	* config/arm/arm-tune.md: Regenerate.
	* config/arm/arm.c (arm_fa726te_tune): New.
	(fa726te_sched_adjust_cost): New.
	(arm_issue_rate): Handle fa726te.
	* config/arm/arm.md (generic_sched): Don't use Generic scheduler for
	Faraday cores.
	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Handle fa526 and fa626.
	* config/arm/t-arm (MD_INCLUDES): Include machine description files for
	Faraday cores.
	* config/arm/t-arm-elf: Add multilib option for Faraday cores.
	* config/arm/t-linux-eabi: Add multilib option for Faraday cores except
	fa526 and fa626.
	* doc/invoke.texi: Document -mcpu for Faraday cores.
	* config/arm/fa526.md: New file.
	* config/arm/fa626.md: New file.
	* config/arm/fa606te.md: New file.
	* config/arm/fa626te.md: New file.
	* config/arm/fmp626.md: New file.
	* config/arm/fa726te.md: New file.



2010/12/9 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> Hi Mingfeng
>
> Sorry about the late response and thanks for working through the issues.
> I've been off sick and only got back to looking at this today.
>
> Some minor nits in your changelog.
>
>
>> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
>>         Mingfeng Wu  <mingfeng@faraday-tech.com>
>>
>>       * config/arm/arm-cores.def: Add Faraday CPU support -
>>       fa526/fa626/fa606te/fa626te/fmp626/fa726te.
>
> ',' instead of '/' in the changelog entry. fa526, fa626 etc.
>
>>       * config/arm/arm-tune.md: Regenerate.
>>       * config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te
>
> It's enough to say New.
>

OK.

>>       (fa726te_sched_adjust_cost): New cost function for fa726te.
>
> Enough to say New.
>

OK.

>>       (arm_issue_rate): Add fa726te.
>
> s/Add/Handle
>

OK.

>>       * config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
>>       and include machine description files.
>
> Replace sentence with :
> Don't use Generic scheduler for Faraday cores.
>

Replaced.

>
>>       * config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
>
> s/Add/Handle
>

OK.

>
>> >
>> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w"
>> "fa726te")
>> >
>> > You have a query_cpu_unit which you don't seem to be querying for in
>> the backend in any form? Is there any thing else
>> > missing in your pipeline description or has this been put in for
>> future use ?
>> >
>>
>> I only use the units defined by query_cpu_unit in the fa726te.md. The
>> two units, fa726te_lsu1_pipe_e and
>> fa726te_lsu1_pipe_w, are only used for arrange the load instructions.
>> Because fa726te only supports one
>> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
>> for better load instruction scheduling.
>
> The reason I asked why you used the define_cpu_unit vs
> define_query_cpu_unit was because there was no backend hook that queried
> for the cpu unit in question. IIRC define_cpu_unit and
> define_query_cpu_unit are more or less identical except for the fact
> that you can result in better minimization in one case than the other.
>
> However in this case the difference in the size of the automaton should
> be minimal considering it isn't a very big automaton in question.
>
>>
>> >> +;; reservation which blocks IS
>> >> +(define_reservation "fa726te_blockage" "(fa726te_is0
>> +fa726te_is1)")
>> >
>> > Can you clarify the comment above ? Again the comments about
>> sentence case and full stops hold.
>
>>
>>
>> It is used to restrict the instruction issue to one.
>
>
> Ok so make that explicit in the comment. Something like:
>
> " ;Reservation to restrict issue to 1.
>

Modified as suggested.

>
>
> Now on to your latest patch submission.
>
>
>
> Still a few formatting issues from your latest patch.
>
>
>> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
>> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md 1970-01-01 08:00:00.000000000 +0800
>> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md     2010-12-02 09:18:17.671515000 +0800
>>
> <...>
>
>>
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> +;; Branch and Call Instructions
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> +
>> +;; Branch instructions are difficult to model accurately.  The FA606TE
>> +;; core can predict most branches.  If the branch is predicted
>> +;; correctly, and predicted early enough, the branch can be completely
>> +;; eliminated from the instruction stream.  Some branches can
>> +;; therefore appear to require zero cycles to execute.  We assume that
>> +;; all branches are predicted correctly, and that the latency is
>> +;; therefore the minimum value.
>> +
>> +(define_insn_reservation "606te_branch_op" 0
>> + (and (eq_attr "tune" "fa606te")
>> +      (eq_attr "type" "branch"))
>> + "fa606te_core")
>> +
>> +;; The latency for a call is actually the latency when the result being available.
>> +;; i.e. R0 ready for int return value. For most cases, the return value is set by a
>                                        ^^^
>                                        2 spaces between '.' and start of next sentence.
>

OK.

>> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
>> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md 1970-01-01 08:00:00.000000000 +0800
>> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md     2010-12-02 09:18:17.671515000 +0800
>> @@ -0,0 +1,171 @@
>>
>> +;; The latency for a call is actually the latency when the result is available.
>> +;; i.e. R0 ready for int return value.
>                                         ^
>
> Very small nit. Remove trailing white-space.
>

OK.

>>
>> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md
>> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md 1970-01-01 08:00:00.000000000 +0800
>> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md     2010-12-02 14:45:23.731365000 +0800
>
> <snip...>
>
>>
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> +;; Pipelines
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> +
>> +;;   The ALU pipeline has fetch, decode, execute, memory, and
>> +;;   write stages.  We only need to model the execute, memory and write
>> +;;   stages.
>> +
>> +;;   E1      E2      E3      E4      E5      WB
>> +;;______________________________________________________
>> +;;
>> +;;      <-------------- LD/ST ----------->
>> +;;    shifter + LU      <-- AU -->
>                                     ^ Trailing whitespace.
>

OK.

>> +;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
>> +;;______________________________________________________
>> +;;
>> +;;      <---------- MUL --------->
>> +;;    shifter + LU      <-- AU -->
>                                     ^ Trailing whitespace.
>

OK.

>
> >From fa726te.md
>
>> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
>                                                                             ^
> Trailing whitespace.
>
>> +                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
>> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
>                                                                             ^
> Trailing whitespace.
>

OK.

>
>> +;; The latency for a call is actually the latency when the result is available.
>> +;; i.e. R0 is ready for int return value.
>
> Likewise.
>

OK.

>
>>
>> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md
>> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md  1970-01-01 08:00:00.000000000 +0800
>> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md      2010-12-02 09:18:17.687514000 +0800
> <snip>
>
>> +;; Pipeline architecture
>> +;;   S       E       M       W(Q1)   Q2
>> +;;   ___________________________________________
>> +;;    shifter alu
>                    ^^^ Multiple trailing whitespaces.
>

OK.

>
>
>> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h
>> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h    2010-12-01 18:48:48.000000000 +0800
>> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h        2010-12-02 09:18:17.660518000 +0800
>> @@ -52,7 +52,7 @@
>>  /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
>>  #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
>>
>> -#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
>> +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
>>
>
> Exceeds the 80 character per line limit.  Can you split this across multiple lines ?
>
> Something like
> #define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*\
> |march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
>
> should do the trick.
>

OK. Split it into two lines.

> However this isn't an approval since I can't approve or reject your patch.
>
>
> cheers
> Ramana
>
>
>
> On Thu, 2010-12-02 at 16:27 +0800, M.F. Wu wrote:
>> Dear Ramana,
>>
>> Thank you for your comments about the patch.
>> The patch has been modified as the attached
>> file shows.
>>
>> The Changlog:
>>
>> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
>>         Mingfeng Wu  <mingfeng@faraday-tech.com>
>>
>>       * config/arm/arm-cores.def: Add Faraday CPU support -
>>       fa526/fa626/fa606te/fa626te/fmp626/fa726te.
>>       * config/arm/arm-tune.md: Regenerate.
>>       * config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te
>>       (fa726te_sched_adjust_cost): New cost function for fa726te.
>>       (arm_issue_rate): Add fa726te.
>>       * config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
>>       and include machine description files.
>>       * config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
>>       * config/arm/t-arm (MD_INCLUDES): Include machine description files for
>>       Faraday cores.
>>       * config/arm/t-arm-elf: Add multilib option for Faraday cores.
>>       * config/arm/t-linux-eabi: Add multilib option for Faraday cores except
>>       fa526 and fa626.
>>       * doc/invoke.texi: Document -mcpu for Faraday cores.
>>       * config/arm/fa526.md: New file.
>>       * config/arm/fa626.md: New file.
>>       * config/arm/fa606te.md: New file.
>>       * config/arm/fa626te.md: New file.
>>       * config/arm/fmp626.md: New file.
>>       * config/arm/fa726te.md: New file.
>>
>>
>>
>> 2010/11/30 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
>> > Hi Mingfeng,
>> >
>> > Thanks for making these changes.
>> >
>> > Please do not make the Changelog a part of the final patch. Please make
>> > this a part of your final mail submission and not a part of your patch.
>> >
>> >
>> >> @@ -7913,6 +7921,36 @@
>> >
>> > <...>
>> >
>> >> +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */
>> >
>> > Full stop at the end of comment followed by 2 spaces before end of comment.
>> > Can you please audit your patch to check for these issues ?
>> >
>> > This is true for comments in the machine description parts of your patch as well.
>> >
>>
>> Fixed.
>>
>> >
>> >> +      if (get_attr_conds(insn)  == CONDS_USE &&
>> >> +          get_attr_type(insn) != TYPE_BRANCH)
>> >> +        {
>> >> +          *cost = 3;
>> >> +          return false;
>> >> +        }
>> >
>> > Space between function name and paranthesis. Thus it should be
>> > get_attr_conds (insn) and not get_attr_conds(insn) as above.
>> >
>> >> +
>> >> +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
>> >> +          || get_attr_conds(insn)  == CONDS_USE)
>> >> +        {
>> >> +          *cost = 0;
>> >> +          return false;
>> >> +        }
>> >> +    }
>> >
>> > Likewise.
>> >
>>
>> Fixed.
>>
>> >
>> >> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> >> >> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
>> >> >> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
>> >> >> 08:00:00.000000000 +0800
>> >> >> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
>> >> >> 14:36:17.916371000 +0800
>> >>
>> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> >> >> > +;; Branch and Call Instructions
>> >> >> >
>> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>> >> >> > +
>> >
>> >> > And equivalently in all the other pipeline descriptions.
>> >> >
>> >>
>> >> Sorry, I don't understand exactly what you mean...
>> >
>> >
>> >> +;; Branch instructions are difficult to model accurately.  The ARM
>> >> +;; core can predict most branches.  If the branch is predicted
>> >>
>> > In the sentence above -
>> > Replace ARM with FA526 and similarly in the other pipeline descriptions.
>> >
>> > The FA526 core isn't one made by ARM and am not sure if you can use the name
>> > in that regard here :)
>> >
>>
>> OK, you are right. I have modified the "ARM" to our core.
>>
>> >> > ;
>> >> >
>> >> > +const struct tune_params arm_fa726te_tune =
>> >> > +{
>> >> > +  arm_9e_rtx_costs,
>> >> > +  fa726te_sched_adjust_cost,
>> >> > +  1
>> >> > +};
>> >> > +
>> >
>> > This part of your patch is now out-of-date thanks to Ian Bolton's latest commits in that area with respect
>> > to preloads. You might want to consider that in your final submission. I suppose using the defaults
>> > and turning off preloads at O3 would be the correct thing to do to get your patch sheperded through.
>> >
>>
>> OK. I modified the arm_fa726te_tune.
>>
>> const struct tune_params arm_fa726te_tune =
>> {
>>   arm_9e_rtx_costs,
>>   fa726te_sched_adjust_cost,
>>   1,
>>   ARM_PREFETCH_NOT_BENEFICIAL
>> };
>>
>>
>> >>
>> >> --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md        1970-01-01 08:00:00.000000000 +0800
>> >> +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md    2010-11-25 17:06:01.877554000 +0800
>> >> @@ -0,0 +1,221 @@
>> >>
>> >> +(define_automaton "fa726te")
>> >> +(automata_option "ndfa")
>> >> +
>> >
>> > Why do you have an ndfa option here? Does this give you benefit with benchmarking on the FA726te core since this usually increases compile time
>> > as the automaton ends up searching for all possible options ?
>> >
>>
>> Yes. the ndfa option does benefit our benchmarking, but a little. So I
>> remove the ndfa option here.
>>
>> >
>> >> +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
>> >> +;; improve code quality
>> >
>> > Full stop at the end of the comment. pretend should start with a capital P and not lower case. (Pretend)
>> > 2 spaces between the a full stop or a punctuation character that terminates a sentence and the start of
>> > the next sentence. There are a number of places in your patch where one can see such cases.
>> >
>>
>> Fixed.
>>
>> >
>> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
>> >
>> > You have a query_cpu_unit which you don't seem to be querying for in the backend in any form? Is there any thing else
>> > missing in your pipeline description or has this been put in for future use ?
>> >
>>
>> I only use the units defined by query_cpu_unit in the fa726te.md. The
>> two units, fa726te_lsu1_pipe_e and
>> fa726te_lsu1_pipe_w, are only used for arrange the load instructions.
>> Because fa726te only supports one
>> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
>> for better load instruction scheduling.
>>
>> >> +;; reservation which blocks IS
>> >> +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
>> >
>> > Can you clarify the comment above ? Again the comments about sentence case and full stops hold.
>> >
>>
>> It is used to restrict the instruction issue to one.
>>
>> >
>> >
>> > cheers
>> > Ramana
>> >
>> >
>> >
>> >
>> >
>> >
>
>
>

[-- Attachment #2: gcc-4.6-svn-167325-faraday-cpu-support.patch --]
[-- Type: application/octet-stream, Size: 46701 bytes --]

diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-cores.def	2010-12-09 08:41:56.060043000 +0800
@@ -74,6 +74,8 @@
 ARM_CORE("strongarm110",  strongarm110,	4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1100", strongarm1100, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
 ARM_CORE("strongarm1110", strongarm1110, 4,	             FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE("fa526",         fa526,        4,                               FL_LDSCHED, fastmul)
+ARM_CORE("fa626",         fa626,        4,                               FL_LDSCHED, fastmul)
 
 /* V4T Architecture Processors */
 ARM_CORE("arm7tdmi",      arm7tdmi,	4T,	FL_CO_PROC          , fastmul)
@@ -104,6 +106,10 @@
 ARM_CORE("xscale",        xscale,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale)
 ARM_CORE("iwmmxt",        iwmmxt,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
 ARM_CORE("iwmmxt2",       iwmmxt2,	5TE,	                         FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale)
+ARM_CORE("fa606te",       fa606te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa626te",       fa626te,      5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fmp626",        fmp626,       5TE,                             FL_LDSCHED, 9e)
+ARM_CORE("fa726te",       fa726te,      5TE,                             FL_LDSCHED, fa726te)
 
 /* V5TEJ Architecture Processors */
 ARM_CORE("arm926ej-s",    arm926ejs,	5TEJ,	                         FL_LDSCHED, 9e)
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm-tune.md	2010-12-08 22:27:17.548521000 +0800
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from arm-cores.def
 (define_attr "tune"
-	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
+	"arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
 	(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.c	2010-12-02 09:20:29.903984000 +0800
@@ -241,6 +241,7 @@
 static rtx arm_pic_static_addr (rtx orig, rtx reg);
 static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
 static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -882,6 +883,14 @@
   ARM_PREFETCH_BENEFICIAL(4,32,32)
 };
 
+const struct tune_params arm_fa726te_tune =
+{
+  arm_9e_rtx_costs,
+  fa726te_sched_adjust_cost,
+  1,
+  ARM_PREFETCH_NOT_BENEFICIAL
+};
+
 
 /* Not all of these give usefully different compilation alternatives,
    but there is no simple way of generalizing them.  */
@@ -7968,6 +7977,36 @@
   return true;
 }
 
+/* Adjust cost hook for FA726TE.  */
+static bool
+fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  /* For FA726TE, a true dependency on CPSR (i.e. set cond followed by
+     predicated) has a penalty of 3.  */
+  if (REG_NOTE_KIND (link) == REG_DEP_TRUE
+      && recog_memoized (insn) >= 0
+      && recog_memoized (dep) >= 0
+      && get_attr_conds (dep) == CONDS_SET)
+    {
+      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency.  */
+      if (get_attr_conds (insn) == CONDS_USE
+          && get_attr_type (insn) != TYPE_BRANCH)
+        {
+          *cost = 3;
+          return false;
+        }
+
+      if (GET_CODE (PATTERN (insn)) == COND_EXEC
+          || get_attr_conds (insn) == CONDS_USE)
+        {
+          *cost = 0;
+          return false;
+        }
+    }
+
+  return true;
+}
+
 /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
    It corrects the value of COST based on the relationship between
    INSN and DEP through the dependence LINK.  It returns the new
@@ -22779,6 +22818,7 @@
     case cortexa5:
     case cortexa8:
     case cortexa9:
+    case fa726te:
       return 2;
 
     default:
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/arm.md	2010-12-02 09:18:17.656513000 +0800
@@ -498,7 +498,7 @@
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
+          (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -516,6 +516,11 @@
 (include "arm1020e.md")
 (include "arm1026ejs.md")
 (include "arm1136jfs.md")
+(include "fa526.md")
+(include "fa606te.md")
+(include "fa626te.md")
+(include "fmp626.md")
+(include "fa726te.md")
 (include "cortex-a5.md")
 (include "cortex-a8.md")
 (include "cortex-a9.md")
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h	2010-12-13 14:27:29.954812000 +0800
@@ -52,7 +52,8 @@
 /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
 
-#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
+#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*\
+|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
 
 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}"
 
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa526.md	2010-12-02 09:18:17.666513000 +0800
@@ -0,0 +1,161 @@
+;; Faraday FA526 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+;;
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 3 (2 cycle penalty).
+;; ALU -> any use: latency = 2 (1 cycle penalty).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA526 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa526_core" "fa526")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "526_alu_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu"))
+ "fa526_core")
+
+(define_insn_reservation "526_alu_shift_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa526_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "526_mult1" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "smlalxy,smulxy,smlaxy,smlalxy"))
+ "fa526_core")
+
+(define_insn_reservation "526_mult2" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
+                       umlals,smulls,smlals,smlawx"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "526_load1_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load1,load_byte"))
+ "fa526_core")
+
+(define_insn_reservation "526_load2_op" 4
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_load3_op" 5
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_load4_op" 6
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "load4"))
+ "fa526_core*4")
+
+(define_insn_reservation "526_store1_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store1"))
+ "fa526_core")
+
+(define_insn_reservation "526_store2_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store2"))
+ "fa526_core*2")
+
+(define_insn_reservation "526_store3_op" 2
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store3"))
+ "fa526_core*3")
+
+(define_insn_reservation "526_store4_op" 3
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "store4"))
+ "fa526_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA526
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "526_branch_op" 0
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "branch"))
+ "fa526_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.  For most cases, the return value is set
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "526_call_op" 1
+ (and (eq_attr "tune" "fa526")
+      (eq_attr "type" "call"))
+ "fa526_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md	2010-12-13 14:23:52.555433000 +0800
@@ -0,0 +1,171 @@
+;; Faraday FA606TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Modeled pipeline characteristics:
+;; LD -> any use: latency = 2 (1 cycle penalty).
+;; ALU -> any use: latency = 1 (0 cycle penalty).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA606TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      E      M      W
+
+(define_cpu_unit "fa606te_core" "fa606te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "606te_alu_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "alu,alu_shift,alu_shift_reg"))
+ "fa606te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "606te_mult1" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlalxy"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_mult2" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_mult3" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "mul,mla,muls,mlas"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_mult4" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals"))
+ "fa606te_core*4")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "606te_load1_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_load2_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_load3_op" 4
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_load4_op" 5
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "load4"))
+ "fa606te_core*4")
+
+(define_insn_reservation "606te_store1_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store1"))
+ "fa606te_core")
+
+(define_insn_reservation "606te_store2_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store2"))
+ "fa606te_core*2")
+
+(define_insn_reservation "606te_store3_op" 2
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store3"))
+ "fa606te_core*3")
+
+(define_insn_reservation "606te_store4_op" 3
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "store4"))
+ "fa606te_core*4")
+
+
+;;(define_insn_reservation "606te_ldm_op" 9
+;; (and (eq_attr "tune" "fa606te")
+;;      (eq_attr "type" "load2,load3,load4,store2,store3,store4"))
+;; "fa606te_core*7")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA606TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "606te_branch_op" 0
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "branch"))
+ "fa606te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.  For most cases, the return value is set
+;; by a mov instruction, which has 1 cycle latency.
+(define_insn_reservation "606te_call_op" 1
+ (and (eq_attr "tune" "fa606te")
+      (eq_attr "type" "call"))
+ "fa606te_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa626te.md	2010-12-02 09:18:17.676517000 +0800
@@ -0,0 +1,165 @@
+;; Faraday FA626TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Modeled pipeline characteristics:
+;; ALU -> simple address LDR/STR: latency = 2 (available after 2 cycles).
+;; ALU -> shifted address LDR/STR: latency = 3.
+;;		( extra 1 cycle unavoidable stall).
+;; ALU -> other use: latency = 2 (available after 2 cycles).
+;; LD  -> simple address LDR/STR: latency = 3 (available after 3 cycles).
+;; LD  -> shifted address LDR/STR: latency = 4
+;;		( extra 1 cycle unavoidable stall).
+;; LD  -> any other use: latency = 3 (available after 3 cycles).
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA626TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;      S      E      M      W
+
+(define_cpu_unit "fa626te_core" "fa626te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "626te_alu_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_alu_shift_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fa626te_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "626te_mult1" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult2" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "mul,mla"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_mult3" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_mult4" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "626te_load1_op" 3
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load1,load_byte"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_load2_op" 4
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load2,load3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_load3_op" 5
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "load4"))
+ "fa626te_core*3")
+
+(define_insn_reservation "626te_store1_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store1"))
+ "fa626te_core")
+
+(define_insn_reservation "626te_store2_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store2,store3"))
+ "fa626te_core*2")
+
+(define_insn_reservation "626te_store3_op" 2
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "store4"))
+ "fa626te_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA626TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "626te_branch_op" 0
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "branch"))
+ "fa626te_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.
+(define_insn_reservation "626te_call_op" 1
+ (and (eq_attr "tune" "fa626,fa626te")
+      (eq_attr "type" "call"))
+ "fa626te_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md	2010-12-13 14:21:42.050775000 +0800
@@ -0,0 +1,218 @@
+;; Faraday FA726TE Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; This automaton provides a pipeline description for the Faraday
+;; FA726TE core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fa726te")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+;;	E1	E2	E3	E4	E5	WB
+;;______________________________________________________
+;;
+;;      <-------------- LD/ST ----------->
+;;    shifter + LU      <-- AU -->
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
+;;______________________________________________________
+;;
+;;      <---------- MUL --------->
+;;    shifter + LU      <-- AU -->
+;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
+
+
+(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
+(define_cpu_unit "fa726te_mac_pipe" "fa726te")
+(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
+
+;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
+;; improve code quality.
+(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
+(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
+
+(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
+;; Reservation to restrict issue to 1.
+(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; Move instructions.
+(define_insn_reservation "726te_shift_op" 1
+  (and (eq_attr "tune" "fa726te")
+       (eq_attr "insn" "mov,mvn"))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with no shifted operand will finish in 1 cycle.
+;; Other ALU instructions take 2 cycles.
+(define_insn_reservation "726te_alu_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+;; ALU operations with a shift-by-register operand.
+;; These really stall in the decoder, in order to read the shift value
+;; in the first cycle.  If the instruction uses both shifter and AU,
+;; it takes 3 cycles.
+(define_insn_reservation "726te_alu_shift_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+
+(define_insn_reservation "726te_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (and (eq_attr "type" "alu_shift_reg")
+           (not (eq_attr "insn" "mov,mvn"))))
+  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times.  Multiply operations occur in both the execute and memory
+;; stages of the pipeline.
+
+(define_insn_reservation "726te_mult_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
+                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
+ "fa726te_issue+fa726te_mac_pipe")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+
+;; Scalar loads are pipelined in FA726TE LSU pipe.
+;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
+;; The 2nd LSU (lsu1) models the fact that if 2 loads are scheduled in the
+;; same "bundle", the 2nd load will introduce another ISSUE stall but is
+;; still ok to execute (and may be beneficial sometimes).
+
+(define_insn_reservation "726te_load1_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load1,load_byte"))
+ "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
+  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
+
+(define_insn_reservation "726te_store1_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store1"))
+ "fa726te_blockage*2")
+
+;; Load/Store Multiple blocks all pipelines in EX stages until WB.
+;; No other instructions can be issued together.  Since they essentially
+;; prevent all scheduling opportunities, we model them together here.
+
+;; The LDM is broken into multiple load instructions; a later instruction in
+;; pipe 1 is stalled.
+(define_insn_reservation "726te_ldm2_op" 4
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load2,load3"))
+ "fa726te_blockage*4")
+
+(define_insn_reservation "726te_ldm3_op" 5
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "load4"))
+ "fa726te_blockage*5")
+
+(define_insn_reservation "726te_stm2_op" 2
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store2,store3"))
+ "fa726te_blockage*3")
+
+(define_insn_reservation "726te_stm3_op" 3
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "store4"))
+ "fa726te_blockage*4")
+
+(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
+                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
+                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
+                 "726te_shift_op,726te_alu_op")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
+                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
+(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
+                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
+
+(define_bypass 4 "726te_load1_op" "726te_mult_op")
+(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
+(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FA726TE
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "726te_branch_op" 0
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "branch"))
+ "fa726te_blockage")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 is ready for int return value.
+(define_insn_reservation "726te_call_op" 1
+ (and (eq_attr "tune" "fa726te")
+      (eq_attr "type" "call"))
+ "fa726te_blockage")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	1970-01-01 08:00:00.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md	2010-12-13 14:25:26.035559000 +0800
@@ -0,0 +1,182 @@
+;; Faraday FMP626 Pipeline Description
+;; Copyright (C) 2010 Free Software Foundation, Inc.
+;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3, or (at your option) any later
+;; version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; These descriptions are based on the information contained in the
+;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
+
+;; Pipeline architecture
+;;	S	E	M	W(Q1)	Q2
+;;   ___________________________________________
+;;    shifter alu
+;;    mul1    mul2    mul3
+;;    ld/st1  ld/st2  ld/st3  ld/st4  ld/st5
+
+;; This automaton provides a pipeline description for the Faraday
+;; FMP626 core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline
+;;
+;;   The ALU pipeline has fetch, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+
+(define_cpu_unit "fmp626_core" "fmp626")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require two cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations
+(define_insn_reservation "mp626_alu_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_alu_shift_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "alu_shift,alu_shift_reg"))
+ "fmp626_core")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_reservation "mp626_mult1" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult2" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "mul,mla"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_mult3" 3
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_mult4" 4
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "insn" "smulls,smlals,umulls,umlals"))
+ "fmp626_core*3")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with a base register writeback
+;; (such as "ldm!").  These models assume that all memory references
+;; hit in dcache.
+
+(define_insn_reservation "mp626_load1_op" 5
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load1,load_byte"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_load2_op" 6
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load2,load3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_load3_op" 7
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "load4"))
+ "fmp626_core*3")
+
+(define_insn_reservation "mp626_store1_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store1"))
+ "fmp626_core")
+
+(define_insn_reservation "mp626_store2_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store2,store3"))
+ "fmp626_core*2")
+
+(define_insn_reservation "mp626_store3_op" 2
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "store4"))
+ "fmp626_core*3")
+
+(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op"
+                 "mp626_store1_op,mp626_store2_op,mp626_store3_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\
+                  mp626_mult3,mp626_mult4" "mp626_store1_op"
+                 "arm_no_early_store_addr_dep")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op")
+(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op"
+                 "arm_no_early_alu_shift_dep")
+(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op")
+(define_bypass 2 "mp626_mult3" "mp626_alu_op")
+(define_bypass 3 "mp626_mult4" "mp626_alu_op")
+(define_bypass 4 "mp626_load1_op" "mp626_alu_op")
+(define_bypass 5 "mp626_load2_op" "mp626_alu_op")
+(define_bypass 6 "mp626_load3_op" "mp626_alu_op")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The FMP626
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "mp626_branch_op" 0
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "branch"))
+ "fmp626_core")
+
+;; The latency for a call is actually the latency when the result is available.
+;; i.e. R0 ready for int return value.
+(define_insn_reservation "mp626_call_op" 1
+ (and (eq_attr "tune" "fmp626")
+      (eq_attr "type" "call"))
+ "fmp626_core")
+
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm	2010-12-02 09:18:17.689525000 +0800
@@ -24,6 +24,11 @@
 		$(srcdir)/config/arm/arm1020e.md \
 		$(srcdir)/config/arm/arm1026ejs.md \
 		$(srcdir)/config/arm/arm1136jfs.md \
+		$(srcdir)/config/arm/fa526.md \
+		$(srcdir)/config/arm/fa606te.md \
+		$(srcdir)/config/arm/fa626te.md \
+		$(srcdir)/config/arm/fmp626.md \
+		$(srcdir)/config/arm/fa726te.md \
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-arm-elf	2010-12-13 16:32:49.345191000 +0800
@@ -36,6 +36,10 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+#MULTILIB_OPTIONS     += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa526 fa626 fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626
+
 #MULTILIB_OPTIONS      += march=armv7
 #MULTILIB_DIRNAMES     += thumb2
 #MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
@@ -52,6 +56,8 @@
 MULTILIB_OPTIONS       += mfloat-abi=hard
 MULTILIB_DIRNAMES      += fpu
 MULTILIB_EXCEPTIONS    += *mthumb/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa526/*mfloat-abi=hard*
+MULTILIB_EXCEPTIONS    += *mcpu=fa626/*mfloat-abi=hard*
 
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi
--- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi	2010-12-01 18:48:48.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/config/arm/t-linux-eabi	2010-12-02 09:18:17.697515000 +0800
@@ -24,6 +24,10 @@
 MULTILIB_OPTIONS	=
 MULTILIB_DIRNAMES	=
 
+#MULTILIB_OPTIONS     += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te
+#MULTILIB_DIRNAMES    += fa606te fa626te fmp626 fa726te
+#MULTILIB_EXCEPTIONS  += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te*
+
 # Use a version of div0 which raises SIGFPE, and a special __clear_cache.
 LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
 
diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi
--- tmp/gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi	2010-12-01 18:34:13.000000000 +0800
+++ gcc-4.6-svn-167325-20101201/gcc/doc/invoke.texi	2010-12-02 09:18:17.785515000 +0800
@@ -10110,7 +10110,9 @@
 @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3},
 @samp{cortex-m1},
 @samp{cortex-m0},
-@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
+@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312},
+@samp{fa526}, @samp{fa626},
+@samp{fa606te}, @samp{fa626te}, @samp{fmp626}, @samp{fa726te}.
 
 @item -mtune=@var{name}
 @opindex mtune

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][4.6][ARM] New CPU support for Faraday cores
  2010-12-13 11:01               ` M.F. Wu
@ 2010-12-20 18:48                 ` Richard Earnshaw
  0 siblings, 0 replies; 11+ messages in thread
From: Richard Earnshaw @ 2010-12-20 18:48 UTC (permalink / raw)
  To: M.F. Wu
  Cc: ramana.radhakrishnan, Joseph S. Myers, gcc-patches, toolchain,
	Matthew.Gretton-Dann, Leon.Chen

On Mon, 2010-12-13 at 16:49 +0800, M.F. Wu wrote:
> Dear Ramana,
> 
> Thank you for your kindly help. I have modified according to
> your suggestion. The following is my Changlog.
> 
> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
> 	    Mingfeng Wu  <mingfeng@faraday-tech.com>
> 
> 	* config/arm/arm-cores.def: Add Faraday CPU support -
> 	fa526,fa626,fa606te,fa626te,fmp626,fa726te.
> 	* config/arm/arm-tune.md: Regenerate.
> 	* config/arm/arm.c (arm_fa726te_tune): New.
> 	(fa726te_sched_adjust_cost): New.
> 	(arm_issue_rate): Handle fa726te.
> 	* config/arm/arm.md (generic_sched): Don't use Generic scheduler for
> 	Faraday cores.
> 	* config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Handle fa526 and fa626.
> 	* config/arm/t-arm (MD_INCLUDES): Include machine description files for
> 	Faraday cores.
> 	* config/arm/t-arm-elf: Add multilib option for Faraday cores.
> 	* config/arm/t-linux-eabi: Add multilib option for Faraday cores except
> 	fa526 and fa626.
> 	* doc/invoke.texi: Document -mcpu for Faraday cores.
> 	* config/arm/fa526.md: New file.
> 	* config/arm/fa626.md: New file.
> 	* config/arm/fa606te.md: New file.
> 	* config/arm/fa626te.md: New file.
> 	* config/arm/fmp626.md: New file.
> 	* config/arm/fa726te.md: New file.

This is OK, and I've now committed it.  

>	* config/arm/fa626.md: New file.

This appears to be missing, but since nothing in the patch references it
I've assumed this is just a typo in the ChangeLog entry.

R.

> 
> 
> 
> 
> 2010/12/9 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> > Hi Mingfeng
> >
> > Sorry about the late response and thanks for working through the issues.
> > I've been off sick and only got back to looking at this today.
> >
> > Some minor nits in your changelog.
> >
> >
> >> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
> >>         Mingfeng Wu  <mingfeng@faraday-tech.com>
> >>
> >>       * config/arm/arm-cores.def: Add Faraday CPU support -
> >>       fa526/fa626/fa606te/fa626te/fmp626/fa726te.
> >
> > ',' instead of '/' in the changelog entry. fa526, fa626 etc.
> >
> >>       * config/arm/arm-tune.md: Regenerate.
> >>       * config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te
> >
> > It's enough to say New.
> >
> 
> OK.
> 
> >>       (fa726te_sched_adjust_cost): New cost function for fa726te.
> >
> > Enough to say New.
> >
> 
> OK.
> 
> >>       (arm_issue_rate): Add fa726te.
> >
> > s/Add/Handle
> >
> 
> OK.
> 
> >>       * config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
> >>       and include machine description files.
> >
> > Replace sentence with :
> > Don't use Generic scheduler for Faraday cores.
> >
> 
> Replaced.
> 
> >
> >>       * config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
> >
> > s/Add/Handle
> >
> 
> OK.
> 
> >
> >> >
> >> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w"
> >> "fa726te")
> >> >
> >> > You have a query_cpu_unit which you don't seem to be querying for in
> >> the backend in any form? Is there any thing else
> >> > missing in your pipeline description or has this been put in for
> >> future use ?
> >> >
> >>
> >> I only use the units defined by query_cpu_unit in the fa726te.md. The
> >> two units, fa726te_lsu1_pipe_e and
> >> fa726te_lsu1_pipe_w, are only used for arrange the load instructions.
> >> Because fa726te only supports one
> >> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
> >> for better load instruction scheduling.
> >
> > The reason I asked why you used the define_cpu_unit vs
> > define_query_cpu_unit was because there was no backend hook that queried
> > for the cpu unit in question. IIRC define_cpu_unit and
> > define_query_cpu_unit are more or less identical except for the fact
> > that you can result in better minimization in one case than the other.
> >
> > However in this case the difference in the size of the automaton should
> > be minimal considering it isn't a very big automaton in question.
> >
> >>
> >> >> +;; reservation which blocks IS
> >> >> +(define_reservation "fa726te_blockage" "(fa726te_is0
> >> +fa726te_is1)")
> >> >
> >> > Can you clarify the comment above ? Again the comments about
> >> sentence case and full stops hold.
> >
> >>
> >>
> >> It is used to restrict the instruction issue to one.
> >
> >
> > Ok so make that explicit in the comment. Something like:
> >
> > " ;Reservation to restrict issue to 1.
> >
> 
> Modified as suggested.
> 
> >
> >
> > Now on to your latest patch submission.
> >
> >
> >
> > Still a few formatting issues from your latest patch.
> >
> >
> >> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
> >> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md 1970-01-01 08:00:00.000000000 +0800
> >> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md     2010-12-02 09:18:17.671515000 +0800
> >>
> > <...>
> >
> >>
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> +;; Branch and Call Instructions
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> +
> >> +;; Branch instructions are difficult to model accurately.  The FA606TE
> >> +;; core can predict most branches.  If the branch is predicted
> >> +;; correctly, and predicted early enough, the branch can be completely
> >> +;; eliminated from the instruction stream.  Some branches can
> >> +;; therefore appear to require zero cycles to execute.  We assume that
> >> +;; all branches are predicted correctly, and that the latency is
> >> +;; therefore the minimum value.
> >> +
> >> +(define_insn_reservation "606te_branch_op" 0
> >> + (and (eq_attr "tune" "fa606te")
> >> +      (eq_attr "type" "branch"))
> >> + "fa606te_core")
> >> +
> >> +;; The latency for a call is actually the latency when the result being available.
> >> +;; i.e. R0 ready for int return value. For most cases, the return value is set by a
> >                                        ^^^
> >                                        2 spaces between '.' and start of next sentence.
> >
> 
> OK.
> 
> >> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md
> >> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md 1970-01-01 08:00:00.000000000 +0800
> >> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa606te.md     2010-12-02 09:18:17.671515000 +0800
> >> @@ -0,0 +1,171 @@
> >>
> >> +;; The latency for a call is actually the latency when the result is available.
> >> +;; i.e. R0 ready for int return value.
> >                                         ^
> >
> > Very small nit. Remove trailing white-space.
> >
> 
> OK.
> 
> >>
> >> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md
> >> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md 1970-01-01 08:00:00.000000000 +0800
> >> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fa726te.md     2010-12-02 14:45:23.731365000 +0800
> >
> > <snip...>
> >
> >>
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> +;; Pipelines
> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> +
> >> +;;   The ALU pipeline has fetch, decode, execute, memory, and
> >> +;;   write stages.  We only need to model the execute, memory and write
> >> +;;   stages.
> >> +
> >> +;;   E1      E2      E3      E4      E5      WB
> >> +;;______________________________________________________
> >> +;;
> >> +;;      <-------------- LD/ST ----------->
> >> +;;    shifter + LU      <-- AU -->
> >                                     ^ Trailing whitespace.
> >
> 
> OK.
> 
> >> +;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
> >> +;;______________________________________________________
> >> +;;
> >> +;;      <---------- MUL --------->
> >> +;;    shifter + LU      <-- AU -->
> >                                     ^ Trailing whitespace.
> >
> 
> OK.
> 
> >
> > >From fa726te.md
> >
> >> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
> >                                                                             ^
> > Trailing whitespace.
> >
> >> +                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
> >> +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
> >                                                                             ^
> > Trailing whitespace.
> >
> 
> OK.
> 
> >
> >> +;; The latency for a call is actually the latency when the result is available.
> >> +;; i.e. R0 is ready for int return value.
> >
> > Likewise.
> >
> 
> OK.
> 
> >
> >>
> >> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md
> >> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md  1970-01-01 08:00:00.000000000 +0800
> >> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/fmp626.md      2010-12-02 09:18:17.687514000 +0800
> > <snip>
> >
> >> +;; Pipeline architecture
> >> +;;   S       E       M       W(Q1)   Q2
> >> +;;   ___________________________________________
> >> +;;    shifter alu
> >                    ^^^ Multiple trailing whitespaces.
> >
> 
> OK.
> 
> >
> >
> >> diff -uNr tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h
> >> --- tmp/gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h    2010-12-01 18:48:48.000000000 +0800
> >> +++ gcc-4.6-svn-167325-20101201/gcc/config/arm/bpabi.h        2010-12-02 09:18:17.660518000 +0800
> >> @@ -52,7 +52,7 @@
> >>  /* The BPABI integer comparison routines return { -1, 0, 1 }.  */
> >>  #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
> >>
> >> -#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
> >> +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
> >>
> >
> > Exceeds the 80 character per line limit.  Can you split this across multiple lines ?
> >
> > Something like
> > #define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*\
> > |march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}"
> >
> > should do the trick.
> >
> 
> OK. Split it into two lines.
> 
> > However this isn't an approval since I can't approve or reject your patch.
> >
> >
> > cheers
> > Ramana
> >
> >
> >
> > On Thu, 2010-12-02 at 16:27 +0800, M.F. Wu wrote:
> >> Dear Ramana,
> >>
> >> Thank you for your comments about the patch.
> >> The patch has been modified as the attached
> >> file shows.
> >>
> >> The Changlog:
> >>
> >> 2010-12-02  Sanjin Liu  <scliu@faraday-tech.com>
> >>         Mingfeng Wu  <mingfeng@faraday-tech.com>
> >>
> >>       * config/arm/arm-cores.def: Add Faraday CPU support -
> >>       fa526/fa626/fa606te/fa626te/fmp626/fa726te.
> >>       * config/arm/arm-tune.md: Regenerate.
> >>       * config/arm/arm.c (arm_fa726te_tune): New tune_params for fa726te
> >>       (fa726te_sched_adjust_cost): New cost function for fa726te.
> >>       (arm_issue_rate): Add fa726te.
> >>       * config/arm/arm.md (generic_sched): Add Faraday cores to generic_sched
> >>       and include machine description files.
> >>       * config/arm/bpabi.h (TARGET_FIX_V4BX_SPEC): Add fa526 and fa626.
> >>       * config/arm/t-arm (MD_INCLUDES): Include machine description files for
> >>       Faraday cores.
> >>       * config/arm/t-arm-elf: Add multilib option for Faraday cores.
> >>       * config/arm/t-linux-eabi: Add multilib option for Faraday cores except
> >>       fa526 and fa626.
> >>       * doc/invoke.texi: Document -mcpu for Faraday cores.
> >>       * config/arm/fa526.md: New file.
> >>       * config/arm/fa626.md: New file.
> >>       * config/arm/fa606te.md: New file.
> >>       * config/arm/fa626te.md: New file.
> >>       * config/arm/fmp626.md: New file.
> >>       * config/arm/fa726te.md: New file.
> >>
> >>
> >>
> >> 2010/11/30 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> >> > Hi Mingfeng,
> >> >
> >> > Thanks for making these changes.
> >> >
> >> > Please do not make the Changelog a part of the final patch. Please make
> >> > this a part of your final mail submission and not a part of your patch.
> >> >
> >> >
> >> >> @@ -7913,6 +7921,36 @@
> >> >
> >> > <...>
> >> >
> >> >> +      /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */
> >> >
> >> > Full stop at the end of comment followed by 2 spaces before end of comment.
> >> > Can you please audit your patch to check for these issues ?
> >> >
> >> > This is true for comments in the machine description parts of your patch as well.
> >> >
> >>
> >> Fixed.
> >>
> >> >
> >> >> +      if (get_attr_conds(insn)  == CONDS_USE &&
> >> >> +          get_attr_type(insn) != TYPE_BRANCH)
> >> >> +        {
> >> >> +          *cost = 3;
> >> >> +          return false;
> >> >> +        }
> >> >
> >> > Space between function name and paranthesis. Thus it should be
> >> > get_attr_conds (insn) and not get_attr_conds(insn) as above.
> >> >
> >> >> +
> >> >> +      if (GET_CODE (PATTERN (insn)) == COND_EXEC
> >> >> +          || get_attr_conds(insn)  == CONDS_USE)
> >> >> +        {
> >> >> +          *cost = 0;
> >> >> +          return false;
> >> >> +        }
> >> >> +    }
> >> >
> >> > Likewise.
> >> >
> >>
> >> Fixed.
> >>
> >> >
> >> >> diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> >> >> gcc-4.6-svn-20101116/gcc/config/arm/fa526.md
> >> >> >> > --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md  1970-01-01
> >> >> >> 08:00:00.000000000 +0800
> >> >> >> > +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md      2010-11-23
> >> >> >> 14:36:17.916371000 +0800
> >> >>
> >> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> >> >> > +;; Branch and Call Instructions
> >> >> >> >
> >> >> >> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >> >> >> > +
> >> >
> >> >> > And equivalently in all the other pipeline descriptions.
> >> >> >
> >> >>
> >> >> Sorry, I don't understand exactly what you mean...
> >> >
> >> >
> >> >> +;; Branch instructions are difficult to model accurately.  The ARM
> >> >> +;; core can predict most branches.  If the branch is predicted
> >> >>
> >> > In the sentence above -
> >> > Replace ARM with FA526 and similarly in the other pipeline descriptions.
> >> >
> >> > The FA526 core isn't one made by ARM and am not sure if you can use the name
> >> > in that regard here :)
> >> >
> >>
> >> OK, you are right. I have modified the "ARM" to our core.
> >>
> >> >> > ;
> >> >> >
> >> >> > +const struct tune_params arm_fa726te_tune =
> >> >> > +{
> >> >> > +  arm_9e_rtx_costs,
> >> >> > +  fa726te_sched_adjust_cost,
> >> >> > +  1
> >> >> > +};
> >> >> > +
> >> >
> >> > This part of your patch is now out-of-date thanks to Ian Bolton's latest commits in that area with respect
> >> > to preloads. You might want to consider that in your final submission. I suppose using the defaults
> >> > and turning off preloads at O3 would be the correct thing to do to get your patch sheperded through.
> >> >
> >>
> >> OK. I modified the arm_fa726te_tune.
> >>
> >> const struct tune_params arm_fa726te_tune =
> >> {
> >>   arm_9e_rtx_costs,
> >>   fa726te_sched_adjust_cost,
> >>   1,
> >>   ARM_PREFETCH_NOT_BENEFICIAL
> >> };
> >>
> >>
> >> >>
> >> >> --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md        1970-01-01 08:00:00.000000000 +0800
> >> >> +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md    2010-11-25 17:06:01.877554000 +0800
> >> >> @@ -0,0 +1,221 @@
> >> >>
> >> >> +(define_automaton "fa726te")
> >> >> +(automata_option "ndfa")
> >> >> +
> >> >
> >> > Why do you have an ndfa option here? Does this give you benefit with benchmarking on the FA726te core since this usually increases compile time
> >> > as the automaton ends up searching for all possible options ?
> >> >
> >>
> >> Yes. the ndfa option does benefit our benchmarking, but a little. So I
> >> remove the ndfa option here.
> >>
> >> >
> >> >> +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
> >> >> +;; improve code quality
> >> >
> >> > Full stop at the end of the comment. pretend should start with a capital P and not lower case. (Pretend)
> >> > 2 spaces between the a full stop or a punctuation character that terminates a sentence and the start of
> >> > the next sentence. There are a number of places in your patch where one can see such cases.
> >> >
> >>
> >> Fixed.
> >>
> >> >
> >> >> +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
> >> >
> >> > You have a query_cpu_unit which you don't seem to be querying for in the backend in any form? Is there any thing else
> >> > missing in your pipeline description or has this been put in for future use ?
> >> >
> >>
> >> I only use the units defined by query_cpu_unit in the fa726te.md. The
> >> two units, fa726te_lsu1_pipe_e and
> >> fa726te_lsu1_pipe_w, are only used for arrange the load instructions.
> >> Because fa726te only supports one
> >> ldr/str pipe, I use the query_cpu_unit to define another pseudo pipe
> >> for better load instruction scheduling.
> >>
> >> >> +;; reservation which blocks IS
> >> >> +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
> >> >
> >> > Can you clarify the comment above ? Again the comments about sentence case and full stops hold.
> >> >
> >>
> >> It is used to restrict the instruction issue to one.
> >>
> >> >
> >> >
> >> > cheers
> >> > Ramana
> >> >
> >> >
> >> >
> >> >
> >> >
> >> >
> >
> >
> >



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-12-20 17:52 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-11-22  7:55 [PATCH][4.6][ARM] New CPU support for Faraday cores M.F. Wu
2010-11-22 18:36 ` Joseph S. Myers
2010-11-23 10:11   ` M.F. Wu
2010-11-24 10:17     ` Ramana Radhakrishnan
2010-11-25 11:31       ` M.F. Wu
2010-11-30 14:51         ` Ramana Radhakrishnan
2010-12-02  8:27           ` M.F. Wu
2010-12-09 14:49             ` Ramana Radhakrishnan
2010-12-13 11:01               ` M.F. Wu
2010-12-20 18:48                 ` Richard Earnshaw
2010-11-24 12:20     ` Ramana Radhakrishnan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).