public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH x86] Add march/mtune=knl
@ 2014-12-10 16:21 Ilya Tocar
  2014-12-10 16:35 ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Ilya Tocar @ 2014-12-10 16:21 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches

Hi,

Patch below adds march/mtune/attribute=knl.
For now this is just silvermont tuning and avx/avx2/avx512 support.
Ok for trunk?

gcc/
	* config.gcc: Support "knl".
	* config/i386/driver-i386.c (host_detect_local_cpu): Detect "knl".
	* config/i386/i386-c.c (ix86_target_macros_internal): Handle
	PROCESSOR_KNL.
	* config/i386/i386.c (m_KNL): Define.
	(processor_target_table): Add "knl".
	(PTA_KNL): Define.
	(ix86_issue_rate): Add PROCESSOR_KNL.
	(ix86_adjust_cost): Ditto.
	(ia32_multipass_dfa_lookahead): Ditto.
	(get_builtin_code_for_version): Handle "knl".
	(fold_builtin_cpu): Ditto.
	* config/i386/i386.h (TARGET_KNL): Define.
	(processor_type): Add PROCESSOR_KNL.
	* config/i386/i386.md (attr "cpu"): Add knl.
	* config/i386/x86-tune.def: Add m_KNL.

gcc/testsuite/
	* gcc.target/i386/funcspec-5.c: Test avx512f and knl.

---
 gcc/config.gcc                             |  3 +-
 gcc/config/i386/driver-i386.c              |  6 +++-
 gcc/config/i386/i386-c.c                   |  7 +++++
 gcc/config/i386/i386.c                     | 17 ++++++++++-
 gcc/config/i386/i386.h                     |  2 ++
 gcc/config/i386/i386.md                    |  2 +-
 gcc/config/i386/x86-tune.def               | 47 +++++++++++++++---------------
 gcc/testsuite/gcc.target/i386/funcspec-5.c |  3 ++
 8 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index fa3e1fc..8541274 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -591,7 +591,8 @@ pentium4 pentium4m pentiumpro prescott"
 x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
 bdver3 bdver4 btver1 btver2 k8 k8-sse3 opteron opteron-sse3 nocona \
 core2 corei7 corei7-avx core-avx-i core-avx2 atom slm nehalem westmere \
-sandybridge ivybridge haswell broadwell bonnell silvermont x86-64 native"
+sandybridge ivybridge haswell broadwell bonnell silvermont knl x86-64 \
+native"
 
 # Additional x86 processors supported by --with-cpu=.  Each processor
 # MUST be separated by exactly one space.
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index a2248ce..69ebebd 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -747,7 +747,11 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 	  if (arch)
 	    {
 	      /* This is unknown family 0x6 CPU.  */
-	      if (has_adx)
+	      /* Assume Knl.  */
+	      if (has_avx512f)
+		cpu = "knl";
+	      /* Assume Broadwell.  */
+	      else if (has_adx)
 		cpu = "broadwell";
 	      else if (has_avx2)
 		/* Assume Haswell.  */
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 3ad7d49..1c604fc3 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -171,6 +171,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
       def_or_undef (parse_in, "__silvermont");
       def_or_undef (parse_in, "__silvermont__");
       break;
+    case PROCESSOR_KNL:
+      def_or_undef (parse_in, "__knl");
+      def_or_undef (parse_in, "__knl__");
+      break;
     /* use PROCESSOR_max to not set/unset the arch macro.  */
     case PROCESSOR_max:
       break;
@@ -277,6 +281,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
       def_or_undef (parse_in, "__tune_slm__");
       def_or_undef (parse_in, "__tune_silvermont__");
       break;
+    case PROCESSOR_KNL:
+      def_or_undef (parse_in, "__tune_knl__");
+      break;
     case PROCESSOR_INTEL:
     case PROCESSOR_GENERIC:
       break;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1e1716e..f0cbe48 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2040,6 +2040,7 @@ const struct processor_costs *ix86_cost = &pentium_cost;
 #define m_CORE_ALL (m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_HASWELL)
 #define m_BONNELL (1<<PROCESSOR_BONNELL)
 #define m_SILVERMONT (1<<PROCESSOR_SILVERMONT)
+#define m_KNL (1<<PROCESSOR_KNL)
 #define m_INTEL (1<<PROCESSOR_INTEL)
 
 #define m_GEODE (1<<PROCESSOR_GEODE)
@@ -2505,6 +2506,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
   {"haswell", &core_cost, 16, 10, 16, 10, 16},
   {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
   {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
+  {"knl", &slm_cost, 16, 15, 16, 7, 16},
   {"intel", &intel_cost, 16, 15, 16, 7, 16},
   {"geode", &geode_cost, 0, 0, 0, 0, 0},
   {"k6", &k6_cost, 32, 7, 32, 7, 32},
@@ -3178,6 +3180,8 @@ ix86_option_override_internal (bool main_args_p,
    | PTA_FMA | PTA_MOVBE | PTA_HLE)
 #define PTA_BROADWELL \
   (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
+#define PTA_KNL \
+  (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
 #define PTA_BONNELL \
   (PTA_CORE2 | PTA_MOVBE)
 #define PTA_SILVERMONT \
@@ -3241,6 +3245,7 @@ ix86_option_override_internal (bool main_args_p,
       {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
       {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
       {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
+      {"knl", PROCESSOR_KNL, CPU_KNL, PTA_KNL},
       {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
       {"geode", PROCESSOR_GEODE, CPU_GEODE,
 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
@@ -25934,6 +25939,7 @@ ix86_issue_rate (void)
     case PROCESSOR_PENTIUM:
     case PROCESSOR_BONNELL:
     case PROCESSOR_SILVERMONT:
+    case PROCESSOR_KNL:
     case PROCESSOR_INTEL:
     case PROCESSOR_K6:
     case PROCESSOR_BTVER2:
@@ -26276,6 +26282,7 @@ ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
       break;
 
     case PROCESSOR_SILVERMONT:
+    case PROCESSOR_KNL:
     case PROCESSOR_INTEL:
       if (!reload_completed)
 	return cost;
@@ -26345,6 +26352,7 @@ ia32_multipass_dfa_lookahead (void)
     case PROCESSOR_HASWELL:
     case PROCESSOR_BONNELL:
     case PROCESSOR_SILVERMONT:
+    case PROCESSOR_KNL:
     case PROCESSOR_INTEL:
       /* Generally, we want haifa-sched:max_issue() to look ahead as far
 	 as many instructions can be executed on a cycle, i.e.,
@@ -34246,7 +34254,8 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
     P_PROC_FMA,
     P_AVX2,
     P_PROC_AVX2,
-    P_AVX512F
+    P_AVX512F,
+    P_PROC_AVX512F
   };
 
  enum feature_priority priority = P_ZERO;
@@ -34350,6 +34359,10 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
 	      arg_str = "bonnell";
 	      priority = P_PROC_SSSE3;
 	      break;
+	    case PROCESSOR_KNL:
+	      arg_str = "knl";
+	      priority = P_PROC_AVX512F;
+	      break;
 	    case PROCESSOR_SILVERMONT:
 	      arg_str = "silvermont";
 	      priority = P_PROC_SSE4_2;
@@ -35268,6 +35281,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
     M_AMDFAM10H,
     M_AMDFAM15H,
     M_INTEL_SILVERMONT,
+    M_INTEL_KNL,
     M_AMD_BTVER1,
     M_AMD_BTVER2,    
     M_CPU_SUBTYPE_START,
@@ -35305,6 +35319,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
       {"haswell", M_INTEL_COREI7_HASWELL},
       {"bonnell", M_INTEL_BONNELL},
       {"silvermont", M_INTEL_SILVERMONT},
+      {"knl", M_INTEL_KNL},
       {"amdfam10h", M_AMDFAM10H},
       {"barcelona", M_AMDFAM10H_BARCELONA},
       {"shanghai", M_AMDFAM10H_SHANGHAI},
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index df7789d..7c35758 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -337,6 +337,7 @@ extern const struct processor_costs ix86_size_cost;
 #define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
 #define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL)
 #define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT)
+#define TARGET_KNL (ix86_tune == PROCESSOR_KNL)
 #define TARGET_INTEL (ix86_tune == PROCESSOR_INTEL)
 #define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC)
 #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
@@ -2272,6 +2273,7 @@ enum processor_type
   PROCESSOR_HASWELL,
   PROCESSOR_BONNELL,
   PROCESSOR_SILVERMONT,
+  PROCESSOR_KNL,
   PROCESSOR_INTEL,
   PROCESSOR_GEODE,
   PROCESSOR_K6,
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9019ed8..7ae511c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -399,7 +399,7 @@
 ;; Processor type.
 (define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
 		    atom,slm,generic,amdfam10,bdver1,bdver2,bdver3,bdver4,
-		    btver2"
+		    btver2,knl"
   (const (symbol_ref "ix86_schedule")))
 
 ;; A basic instruction type.  Refinements due to arguments to be
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b5c6e4f..db43b3d 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -41,7 +41,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
           m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 
-	  | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+	  | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
    on modern chips.  Preffer stores affecting whole integer register
@@ -49,7 +49,7 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
    value over movb.  */
 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
           m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-	  | m_AMD_MULTIPLE | m_GENERIC)
+	  | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
    destinations to be 128bit to allow register renaming on 128bit SSE units,
@@ -85,13 +85,13 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
    partial dependencies.  */
 DEF_TUNE (X86_TUNE_MOVX, "movx",
           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-	  | m_INTEL | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC)
+	  | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC)
 
 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
    full sized loads.  */
 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
           m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-	  | m_AMD_MULTIPLE | m_GENERIC)
+	  | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
    conditional jump instruction for 32 bit TARGET.
@@ -125,7 +125,7 @@ DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
    during reassociation of fp computation.  */
 DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
-          m_BONNELL | m_SILVERMONT | m_HASWELL | m_INTEL | m_BDVER1
+          m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL |m_INTEL | m_BDVER1
 	  | m_BDVER2 | m_GENERIC)
 
 /*****************************************************************************/
@@ -145,7 +145,7 @@ DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
    regression on mgrid due to IRA limitation leading to unecessary
    use of the frame pointer in 32bit mode.  */
 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
-	  m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
+	  m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
 	  | m_ATHLON_K8)
 
 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
@@ -205,7 +205,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
    than 4 branch instructions in the 16 byte window.  */
 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
-          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL |
+          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL |m_INTEL |
 	  m_ATHLON_K8 | m_AMDFAM10)
 
 /*****************************************************************************/
@@ -229,21 +229,22 @@ DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
           ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-	    | m_GENERIC))
+	   |  m_KNL | m_GENERIC))
 
 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
    for DFmode copies */
 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
           ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-	    | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
+	    | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
 
 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
    will impact LEA instruction selection. */
-DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_INTEL)
+DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
+	  | m_INTEL)
 
 /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
 DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
-	  m_BONNELL | m_SILVERMONT)
+	  m_BONNELL | m_SILVERMONT | m_KNL)
 
 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
    vector path on AMD machines.
@@ -260,7 +261,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
    a conditional move.  */
 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
-	  m_BONNELL | m_SILVERMONT | m_INTEL)
+	  m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
 
 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
@@ -278,17 +279,17 @@ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-	  | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
-	  | m_GENERIC)
+	  | m_KNL | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
+	  | m_BTVER | m_GENERIC)
 
 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions.  */
 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
-	  ~(m_PENT | m_BONNELL | m_SILVERMONT | m_INTEL  | m_K6))
+	  ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL  | m_K6))
 
 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
-          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE
-	  | m_GENERIC)
+          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
+	  | m_AMD_MULTIPLE | m_GENERIC)
 
 /*****************************************************************************/
 /* 387 instruction selection tuning                                          */
@@ -304,7 +305,7 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
    integer operand.  */
 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
           ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-	    | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+	    | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
 
 /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
@@ -312,7 +313,7 @@ DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-	  | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
+	  | m_KNL | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
 
 /*****************************************************************************/
 /* SSE instruction selection tuning                                          */
@@ -331,13 +332,13 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
    of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
           m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER
-	  | m_BTVER | m_SILVERMONT | m_INTEL | m_GENERIC)
+	  | m_BTVER | m_SILVERMONT | m_KNL | m_INTEL | m_GENERIC)
 
 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
    of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
           m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT
-	  | m_INTEL | m_GENERIC)
+	  | m_KNL | m_INTEL | m_GENERIC)
 
 /* Use packed single precision instructions where posisble.  I.e. movups instead
    of movupd.  */
@@ -374,7 +375,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
-          m_SILVERMONT | m_INTEL)
+          m_SILVERMONT | m_KNL | m_INTEL)
 
 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
    from FP to FP.  This form of instructions avoids partial write to the
@@ -388,7 +389,7 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 
 /* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction.  */
 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
-          m_BONNELL | m_SILVERMONT | m_INTEL)
+          m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
 
 /* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
    execute 2 or more vector instructions in parallel.  */
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c
index 0acfe00..269e610 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c
@@ -24,6 +24,7 @@ extern void test_ssse3 (void)			__attribute__((__target__("ssse3")));
 extern void test_tbm (void)			__attribute__((__target__("tbm")));
 extern void test_avx (void)			__attribute__((__target__("avx")));
 extern void test_avx2 (void)			__attribute__((__target__("avx2")));
+extern void test_avx512 (void)			__attribute__((__target__("avx512")));
 
 extern void test_no_abm (void)			__attribute__((__target__("no-abm")));
 extern void test_no_aes (void)			__attribute__((__target__("no-aes")));
@@ -46,6 +47,7 @@ extern void test_no_ssse3 (void)		__attribute__((__target__("no-ssse3")));
 extern void test_no_tbm (void)			__attribute__((__target__("no-tbm")));
 extern void test_no_avx (void)			__attribute__((__target__("no-avx")));
 extern void test_no_avx2 (void)   		__attribute__((__target__("no-avx2")));
+extern void test_no_avx512 (void)   		__attribute__((__target__("no-avx512")));
 
 extern void test_arch_i386 (void)		__attribute__((__target__("arch=i386")));
 extern void test_arch_i486 (void)		__attribute__((__target__("arch=i486")));
@@ -70,6 +72,7 @@ extern void test_arch_core2 (void)		__attribute__((__target__("arch=core2")));
 extern void test_arch_corei7 (void)		__attribute__((__target__("arch=corei7")));
 extern void test_arch_corei7_avx (void)		__attribute__((__target__("arch=corei7-avx")));
 extern void test_arch_core_avx2 (void)		__attribute__((__target__("arch=core-avx2")));
+extern void test_arch_knl (void)		__attribute__((__target__("arch=knl")));
 extern void test_arch_geode (void)		__attribute__((__target__("arch=geode")));
 extern void test_arch_k6 (void)			__attribute__((__target__("arch=k6")));
 extern void test_arch_k6_2 (void)		__attribute__((__target__("arch=k6-2")));
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH x86] Add march/mtune=knl
  2014-12-10 16:21 [PATCH x86] Add march/mtune=knl Ilya Tocar
@ 2014-12-10 16:35 ` Uros Bizjak
  2015-01-24 11:53   ` Tom de Vries
  0 siblings, 1 reply; 4+ messages in thread
From: Uros Bizjak @ 2014-12-10 16:35 UTC (permalink / raw)
  To: Ilya Tocar; +Cc: GCC Patches

On Wed, Dec 10, 2014 at 5:20 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
> Hi,
>
> Patch below adds march/mtune/attribute=knl.
> For now this is just silvermont tuning and avx/avx2/avx512 support.
> Ok for trunk?
>
> gcc/
>         * config.gcc: Support "knl".
>         * config/i386/driver-i386.c (host_detect_local_cpu): Detect "knl".
>         * config/i386/i386-c.c (ix86_target_macros_internal): Handle
>         PROCESSOR_KNL.
>         * config/i386/i386.c (m_KNL): Define.
>         (processor_target_table): Add "knl".
>         (PTA_KNL): Define.
>         (ix86_issue_rate): Add PROCESSOR_KNL.
>         (ix86_adjust_cost): Ditto.
>         (ia32_multipass_dfa_lookahead): Ditto.
>         (get_builtin_code_for_version): Handle "knl".
>         (fold_builtin_cpu): Ditto.
>         * config/i386/i386.h (TARGET_KNL): Define.
>         (processor_type): Add PROCESSOR_KNL.
>         * config/i386/i386.md (attr "cpu"): Add knl.
>         * config/i386/x86-tune.def: Add m_KNL.
>
> gcc/testsuite/
>         * gcc.target/i386/funcspec-5.c: Test avx512f and knl.

OK with a small comment nit below.

Thanks,
Uros.

>
> ---
>  gcc/config.gcc                             |  3 +-
>  gcc/config/i386/driver-i386.c              |  6 +++-
>  gcc/config/i386/i386-c.c                   |  7 +++++
>  gcc/config/i386/i386.c                     | 17 ++++++++++-
>  gcc/config/i386/i386.h                     |  2 ++
>  gcc/config/i386/i386.md                    |  2 +-
>  gcc/config/i386/x86-tune.def               | 47 +++++++++++++++---------------
>  gcc/testsuite/gcc.target/i386/funcspec-5.c |  3 ++
>  8 files changed, 60 insertions(+), 27 deletions(-)
>
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index fa3e1fc..8541274 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -591,7 +591,8 @@ pentium4 pentium4m pentiumpro prescott"
>  x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
>  bdver3 bdver4 btver1 btver2 k8 k8-sse3 opteron opteron-sse3 nocona \
>  core2 corei7 corei7-avx core-avx-i core-avx2 atom slm nehalem westmere \
> -sandybridge ivybridge haswell broadwell bonnell silvermont x86-64 native"
> +sandybridge ivybridge haswell broadwell bonnell silvermont knl x86-64 \
> +native"
>
>  # Additional x86 processors supported by --with-cpu=.  Each processor
>  # MUST be separated by exactly one space.
> diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
> index a2248ce..69ebebd 100644
> --- a/gcc/config/i386/driver-i386.c
> +++ b/gcc/config/i386/driver-i386.c
> @@ -747,7 +747,11 @@ const char *host_detect_local_cpu (int argc, const char **argv)
>           if (arch)
>             {
>               /* This is unknown family 0x6 CPU.  */
> -             if (has_adx)
> +             /* Assume Knl.  */

/* Assume Knights Landing.  */

> +             if (has_avx512f)
> +               cpu = "knl";
> +             /* Assume Broadwell.  */
> +             else if (has_adx)
>                 cpu = "broadwell";
>               else if (has_avx2)
>                 /* Assume Haswell.  */
> diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
> index 3ad7d49..1c604fc3 100644
> --- a/gcc/config/i386/i386-c.c
> +++ b/gcc/config/i386/i386-c.c
> @@ -171,6 +171,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
>        def_or_undef (parse_in, "__silvermont");
>        def_or_undef (parse_in, "__silvermont__");
>        break;
> +    case PROCESSOR_KNL:
> +      def_or_undef (parse_in, "__knl");
> +      def_or_undef (parse_in, "__knl__");
> +      break;
>      /* use PROCESSOR_max to not set/unset the arch macro.  */
>      case PROCESSOR_max:
>        break;
> @@ -277,6 +281,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
>        def_or_undef (parse_in, "__tune_slm__");
>        def_or_undef (parse_in, "__tune_silvermont__");
>        break;
> +    case PROCESSOR_KNL:
> +      def_or_undef (parse_in, "__tune_knl__");
> +      break;
>      case PROCESSOR_INTEL:
>      case PROCESSOR_GENERIC:
>        break;
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 1e1716e..f0cbe48 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -2040,6 +2040,7 @@ const struct processor_costs *ix86_cost = &pentium_cost;
>  #define m_CORE_ALL (m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_HASWELL)
>  #define m_BONNELL (1<<PROCESSOR_BONNELL)
>  #define m_SILVERMONT (1<<PROCESSOR_SILVERMONT)
> +#define m_KNL (1<<PROCESSOR_KNL)
>  #define m_INTEL (1<<PROCESSOR_INTEL)
>
>  #define m_GEODE (1<<PROCESSOR_GEODE)
> @@ -2505,6 +2506,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
>    {"haswell", &core_cost, 16, 10, 16, 10, 16},
>    {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
>    {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
> +  {"knl", &slm_cost, 16, 15, 16, 7, 16},
>    {"intel", &intel_cost, 16, 15, 16, 7, 16},
>    {"geode", &geode_cost, 0, 0, 0, 0, 0},
>    {"k6", &k6_cost, 32, 7, 32, 7, 32},
> @@ -3178,6 +3180,8 @@ ix86_option_override_internal (bool main_args_p,
>     | PTA_FMA | PTA_MOVBE | PTA_HLE)
>  #define PTA_BROADWELL \
>    (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
> +#define PTA_KNL \
> +  (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
>  #define PTA_BONNELL \
>    (PTA_CORE2 | PTA_MOVBE)
>  #define PTA_SILVERMONT \
> @@ -3241,6 +3245,7 @@ ix86_option_override_internal (bool main_args_p,
>        {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
>        {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
>        {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
> +      {"knl", PROCESSOR_KNL, CPU_KNL, PTA_KNL},
>        {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
>        {"geode", PROCESSOR_GEODE, CPU_GEODE,
>         PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
> @@ -25934,6 +25939,7 @@ ix86_issue_rate (void)
>      case PROCESSOR_PENTIUM:
>      case PROCESSOR_BONNELL:
>      case PROCESSOR_SILVERMONT:
> +    case PROCESSOR_KNL:
>      case PROCESSOR_INTEL:
>      case PROCESSOR_K6:
>      case PROCESSOR_BTVER2:
> @@ -26276,6 +26282,7 @@ ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
>        break;
>
>      case PROCESSOR_SILVERMONT:
> +    case PROCESSOR_KNL:
>      case PROCESSOR_INTEL:
>        if (!reload_completed)
>         return cost;
> @@ -26345,6 +26352,7 @@ ia32_multipass_dfa_lookahead (void)
>      case PROCESSOR_HASWELL:
>      case PROCESSOR_BONNELL:
>      case PROCESSOR_SILVERMONT:
> +    case PROCESSOR_KNL:
>      case PROCESSOR_INTEL:
>        /* Generally, we want haifa-sched:max_issue() to look ahead as far
>          as many instructions can be executed on a cycle, i.e.,
> @@ -34246,7 +34254,8 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
>      P_PROC_FMA,
>      P_AVX2,
>      P_PROC_AVX2,
> -    P_AVX512F
> +    P_AVX512F,
> +    P_PROC_AVX512F
>    };
>
>   enum feature_priority priority = P_ZERO;
> @@ -34350,6 +34359,10 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
>               arg_str = "bonnell";
>               priority = P_PROC_SSSE3;
>               break;
> +           case PROCESSOR_KNL:
> +             arg_str = "knl";
> +             priority = P_PROC_AVX512F;
> +             break;
>             case PROCESSOR_SILVERMONT:
>               arg_str = "silvermont";
>               priority = P_PROC_SSE4_2;
> @@ -35268,6 +35281,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>      M_AMDFAM10H,
>      M_AMDFAM15H,
>      M_INTEL_SILVERMONT,
> +    M_INTEL_KNL,
>      M_AMD_BTVER1,
>      M_AMD_BTVER2,
>      M_CPU_SUBTYPE_START,
> @@ -35305,6 +35319,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>        {"haswell", M_INTEL_COREI7_HASWELL},
>        {"bonnell", M_INTEL_BONNELL},
>        {"silvermont", M_INTEL_SILVERMONT},
> +      {"knl", M_INTEL_KNL},
>        {"amdfam10h", M_AMDFAM10H},
>        {"barcelona", M_AMDFAM10H_BARCELONA},
>        {"shanghai", M_AMDFAM10H_SHANGHAI},
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index df7789d..7c35758 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -337,6 +337,7 @@ extern const struct processor_costs ix86_size_cost;
>  #define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
>  #define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL)
>  #define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT)
> +#define TARGET_KNL (ix86_tune == PROCESSOR_KNL)
>  #define TARGET_INTEL (ix86_tune == PROCESSOR_INTEL)
>  #define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC)
>  #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
> @@ -2272,6 +2273,7 @@ enum processor_type
>    PROCESSOR_HASWELL,
>    PROCESSOR_BONNELL,
>    PROCESSOR_SILVERMONT,
> +  PROCESSOR_KNL,
>    PROCESSOR_INTEL,
>    PROCESSOR_GEODE,
>    PROCESSOR_K6,
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 9019ed8..7ae511c 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -399,7 +399,7 @@
>  ;; Processor type.
>  (define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
>                     atom,slm,generic,amdfam10,bdver1,bdver2,bdver3,bdver4,
> -                   btver2"
> +                   btver2,knl"
>    (const (symbol_ref "ix86_schedule")))
>
>  ;; A basic instruction type.  Refinements due to arguments to be
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index b5c6e4f..db43b3d 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -41,7 +41,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>  /* X86_TUNE_SCHEDULE: Enable scheduling.  */
>  DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>            m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
> -         | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
> +         | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
>
>  /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
>     on modern chips.  Preffer stores affecting whole integer register
> @@ -49,7 +49,7 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>     value over movb.  */
>  DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
>            m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
> -         | m_AMD_MULTIPLE | m_GENERIC)
> +         | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
>
>  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
>     destinations to be 128bit to allow register renaming on 128bit SSE units,
> @@ -85,13 +85,13 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>     partial dependencies.  */
>  DEF_TUNE (X86_TUNE_MOVX, "movx",
>            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -         | m_INTEL | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC)
> +         | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC)
>
>  /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
>     full sized loads.  */
>  DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>            m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
> -         | m_AMD_MULTIPLE | m_GENERIC)
> +         | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
>
>  /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
>     conditional jump instruction for 32 bit TARGET.
> @@ -125,7 +125,7 @@ DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
>  /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
>     during reassociation of fp computation.  */
>  DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
> -          m_BONNELL | m_SILVERMONT | m_HASWELL | m_INTEL | m_BDVER1
> +          m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL |m_INTEL | m_BDVER1
>           | m_BDVER2 | m_GENERIC)
>
>  /*****************************************************************************/
> @@ -145,7 +145,7 @@ DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
>     regression on mgrid due to IRA limitation leading to unecessary
>     use of the frame pointer in 32bit mode.  */
>  DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
> -         m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
> +         m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
>           | m_ATHLON_K8)
>
>  /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
> @@ -205,7 +205,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
>  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>     than 4 branch instructions in the 16 byte window.  */
>  DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
> -          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL |
> +          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL |m_INTEL |
>           m_ATHLON_K8 | m_AMDFAM10)
>
>  /*****************************************************************************/
> @@ -229,21 +229,22 @@ DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
>  /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
>  DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
>            ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
> -           | m_GENERIC))
> +          |  m_KNL | m_GENERIC))
>
>  /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
>     for DFmode copies */
>  DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
>            ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -           | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
> +           | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
>
>  /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>     will impact LEA instruction selection. */
> -DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_INTEL)
> +DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
> +         | m_INTEL)
>
>  /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
>  DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
> -         m_BONNELL | m_SILVERMONT)
> +         m_BONNELL | m_SILVERMONT | m_KNL)
>
>  /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
>     vector path on AMD machines.
> @@ -260,7 +261,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
>  /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
>     a conditional move.  */
>  DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
> -         m_BONNELL | m_SILVERMONT | m_INTEL)
> +         m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
>
>  /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
>     as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
> @@ -278,17 +279,17 @@ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
>  /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
>  DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
>            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -         | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
> -         | m_GENERIC)
> +         | m_KNL | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
> +         | m_BTVER | m_GENERIC)
>
>  /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions.  */
>  DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
> -         ~(m_PENT | m_BONNELL | m_SILVERMONT | m_INTEL  | m_K6))
> +         ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL  | m_K6))
>
>  /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
>  DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
> -          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE
> -         | m_GENERIC)
> +          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
> +         | m_AMD_MULTIPLE | m_GENERIC)
>
>  /*****************************************************************************/
>  /* 387 instruction selection tuning                                          */
> @@ -304,7 +305,7 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
>     integer operand.  */
>  DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
>            ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -           | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
> +           | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
>
>  /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
>  DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
> @@ -312,7 +313,7 @@ DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
>  /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
>  DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
>            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -         | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
> +         | m_KNL | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
>
>  /*****************************************************************************/
>  /* SSE instruction selection tuning                                          */
> @@ -331,13 +332,13 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
>     of a sequence loading registers by parts.  */
>  DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
>            m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER
> -         | m_BTVER | m_SILVERMONT | m_INTEL | m_GENERIC)
> +         | m_BTVER | m_SILVERMONT | m_KNL | m_INTEL | m_GENERIC)
>
>  /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
>     of a sequence loading registers by parts.  */
>  DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
>            m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT
> -         | m_INTEL | m_GENERIC)
> +         | m_KNL | m_INTEL | m_GENERIC)
>
>  /* Use packed single precision instructions where posisble.  I.e. movups instead
>     of movupd.  */
> @@ -374,7 +375,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
>  /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
>     fp converts to destination register.  */
>  DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
> -          m_SILVERMONT | m_INTEL)
> +          m_SILVERMONT | m_KNL | m_INTEL)
>
>  /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
>     from FP to FP.  This form of instructions avoids partial write to the
> @@ -388,7 +389,7 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
>
>  /* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction.  */
>  DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
> -          m_BONNELL | m_SILVERMONT | m_INTEL)
> +          m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
>
>  /* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
>     execute 2 or more vector instructions in parallel.  */
> diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c
> index 0acfe00..269e610 100644
> --- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
> +++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c
> @@ -24,6 +24,7 @@ extern void test_ssse3 (void)                 __attribute__((__target__("ssse3")));
>  extern void test_tbm (void)                    __attribute__((__target__("tbm")));
>  extern void test_avx (void)                    __attribute__((__target__("avx")));
>  extern void test_avx2 (void)                   __attribute__((__target__("avx2")));
> +extern void test_avx512 (void)                 __attribute__((__target__("avx512")));
>
>  extern void test_no_abm (void)                 __attribute__((__target__("no-abm")));
>  extern void test_no_aes (void)                 __attribute__((__target__("no-aes")));
> @@ -46,6 +47,7 @@ extern void test_no_ssse3 (void)              __attribute__((__target__("no-ssse3")));
>  extern void test_no_tbm (void)                 __attribute__((__target__("no-tbm")));
>  extern void test_no_avx (void)                 __attribute__((__target__("no-avx")));
>  extern void test_no_avx2 (void)                __attribute__((__target__("no-avx2")));
> +extern void test_no_avx512 (void)              __attribute__((__target__("no-avx512")));
>
>  extern void test_arch_i386 (void)              __attribute__((__target__("arch=i386")));
>  extern void test_arch_i486 (void)              __attribute__((__target__("arch=i486")));
> @@ -70,6 +72,7 @@ extern void test_arch_core2 (void)            __attribute__((__target__("arch=core2")));
>  extern void test_arch_corei7 (void)            __attribute__((__target__("arch=corei7")));
>  extern void test_arch_corei7_avx (void)                __attribute__((__target__("arch=corei7-avx")));
>  extern void test_arch_core_avx2 (void)         __attribute__((__target__("arch=core-avx2")));
> +extern void test_arch_knl (void)               __attribute__((__target__("arch=knl")));
>  extern void test_arch_geode (void)             __attribute__((__target__("arch=geode")));
>  extern void test_arch_k6 (void)                        __attribute__((__target__("arch=k6")));
>  extern void test_arch_k6_2 (void)              __attribute__((__target__("arch=k6-2")));
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH x86] Add march/mtune=knl
  2014-12-10 16:35 ` Uros Bizjak
@ 2015-01-24 11:53   ` Tom de Vries
  2015-01-24 12:13     ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Tom de Vries @ 2015-01-24 11:53 UTC (permalink / raw)
  To: Uros Bizjak, Ilya Tocar; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 954 bytes --]

On 10-12-14 17:35, Uros Bizjak wrote:
> On Wed, Dec 10, 2014 at 5:20 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:

>> gcc/testsuite/
>>          * gcc.target/i386/funcspec-5.c: Test avx512f and knl.
>

>> --- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
>> +++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c

>> +extern void test_avx512 (void)                 __attribute__((__target__("avx512")));

>> +extern void test_no_avx512 (void)              __attribute__((__target__("no-avx512")));

funcspec-5.c is currently failing (mentioned in PR64342) with:
...
Excess errors:
src/gcc/testsuite/gcc.target/i386/funcspec-5.c:27:1: error: 
attribute(target("avx512")) is unknown
src/gcc/testsuite/gcc.target/i386/funcspec-5.c:50:1: error: 
attribute(target("no-avx512")) is unknown
...

Given the use of avx512f in the ChangeLog entry, I assume avx512f was meant in
the attributes instead of avx512?

Attached patch ok for stage4 trunk?

Thanks,
- Tom



[-- Attachment #2: 0001-Fix-avx512f-spec-in-funcspec-5.c.patch --]
[-- Type: text/x-patch, Size: 1684 bytes --]

2015-01-24  Tom de Vries  <tom@codesourcery.com>

	* gcc.target/i386/funcspec-5.c: Replace avx512 with avx512f.
---
 gcc/testsuite/gcc.target/i386/funcspec-5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c
index 269e610..d796484 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
+++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c
@@ -24,7 +24,7 @@ extern void test_ssse3 (void)			__attribute__((__target__("ssse3")));
 extern void test_tbm (void)			__attribute__((__target__("tbm")));
 extern void test_avx (void)			__attribute__((__target__("avx")));
 extern void test_avx2 (void)			__attribute__((__target__("avx2")));
-extern void test_avx512 (void)			__attribute__((__target__("avx512")));
+extern void test_avx512f (void)			__attribute__((__target__("avx512f")));
 
 extern void test_no_abm (void)			__attribute__((__target__("no-abm")));
 extern void test_no_aes (void)			__attribute__((__target__("no-aes")));
@@ -47,7 +47,7 @@ extern void test_no_ssse3 (void)		__attribute__((__target__("no-ssse3")));
 extern void test_no_tbm (void)			__attribute__((__target__("no-tbm")));
 extern void test_no_avx (void)			__attribute__((__target__("no-avx")));
 extern void test_no_avx2 (void)   		__attribute__((__target__("no-avx2")));
-extern void test_no_avx512 (void)   		__attribute__((__target__("no-avx512")));
+extern void test_no_avx512f (void)   		__attribute__((__target__("no-avx512f")));
 
 extern void test_arch_i386 (void)		__attribute__((__target__("arch=i386")));
 extern void test_arch_i486 (void)		__attribute__((__target__("arch=i486")));
-- 
1.9.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH x86] Add march/mtune=knl
  2015-01-24 11:53   ` Tom de Vries
@ 2015-01-24 12:13     ` Uros Bizjak
  0 siblings, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2015-01-24 12:13 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Ilya Tocar, GCC Patches

On Sat, Jan 24, 2015 at 11:40 AM, Tom de Vries <Tom_deVries@mentor.com> wrote:
> On 10-12-14 17:35, Uros Bizjak wrote:
>>
>> On Wed, Dec 10, 2014 at 5:20 PM, Ilya Tocar <tocarip.intel@gmail.com>
>> wrote:
>
>
>>> gcc/testsuite/
>>>          * gcc.target/i386/funcspec-5.c: Test avx512f and knl.
>>
>>
>
>>> --- a/gcc/testsuite/gcc.target/i386/funcspec-5.c
>>> +++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c
>
>
>>> +extern void test_avx512 (void)
>>> __attribute__((__target__("avx512")));
>
>
>>> +extern void test_no_avx512 (void)
>>> __attribute__((__target__("no-avx512")));
>
>
> funcspec-5.c is currently failing (mentioned in PR64342) with:
> ...
> Excess errors:
> src/gcc/testsuite/gcc.target/i386/funcspec-5.c:27:1: error:
> attribute(target("avx512")) is unknown
> src/gcc/testsuite/gcc.target/i386/funcspec-5.c:50:1: error:
> attribute(target("no-avx512")) is unknown
> ...
>
> Given the use of avx512f in the ChangeLog entry, I assume avx512f was meant
> in the attributes instead of avx512?
>
> Attached patch ok for stage4 trunk?

OK.

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-01-24 10:48 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-10 16:21 [PATCH x86] Add march/mtune=knl Ilya Tocar
2014-12-10 16:35 ` Uros Bizjak
2015-01-24 11:53   ` Tom de Vries
2015-01-24 12:13     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).