From: Vladimir Yakovlev <vbyakovl23@gmail.com>
To: Uros Bizjak <ubizjak@gmail.com>, jakub@redhat.com
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [RFC, x86] Changes for AVX and AVX2 processors
Date: Fri, 11 Jan 2013 12:15:00 -0000 [thread overview]
Message-ID: <CAK1BsWo+gd6jXwtuNgHVCAWw07EsjiXSVgjUF66j68cAW2Sj8A@mail.gmail.com> (raw)
In-Reply-To: <20130111112745.GG7269@tucnak.redhat.com>
[-- Attachment #1: Type: text/plain, Size: 1272 bytes --]
I sent the patch. Send it once more.
2013/1/11 Jakub Jelinek <jakub@redhat.com>:
> On Fri, Jan 11, 2013 at 03:25:47PM +0400, Vladimir Yakovlev wrote:
>> I've fixed Changelog. Can we commit the patch to trunk now?
>>
>> 2012-12-27 Vladimir Yakovlev <vladimir.b.yakovlev@intel.com>
>>
>> * config/i386/i386-c.c (ix86_target_macros_internal): New case.
>> (ix86_target_macros_internal): Likewise.
>>
>> * config/i386/i386.c (m_CORE2I7): Removed.
>> (m_CORE_HASWELL): New macro.
>> (m_CORE_ALL): Likewise.
>> (initial_ix86_tune_features): m_CORE2I7 is replaced by m_CORE_ALL.
>> (initial_ix86_arch_features): Likewise.
>> (processor_target_table): Initializations for Core avx2.
>> (cpu_names): New names "core-avx2".
>> (ix86_option_override_internal): Changed PROCESSOR_COREI7 by
>> PROCESSOR_CORE_HASWELL.
>> (ix86_issue_rate): New case.
>> (ia32_multipass_dfa_lookahead): Likewise.
>> (ix86_sched_init_global): Likewise.
>>
>> * config/i386/i386.h (TARGET_HASWELL): New macro.
>> (target_cpu_default): New TARGET_CPU_DEFAULT_haswell.
>> (processor_type): New PROCESSOR_HASWELL.
>
> Uros already acked the patch, so it certainly is ok to commit now.
>
> Jakub
[-- Attachment #2: patch1 --]
[-- Type: application/octet-stream, Size: 12723 bytes --]
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 22e5e9b..2d8abd5 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -142,6 +142,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__corei7");
def_or_undef (parse_in, "__corei7__");
break;
+ case PROCESSOR_HASWELL:
+ def_or_undef (parse_in, "__core_avx2");
+ def_or_undef (parse_in, "__core_avx2__");
+ break;
case PROCESSOR_ATOM:
def_or_undef (parse_in, "__atom");
def_or_undef (parse_in, "__atom__");
@@ -232,6 +236,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_COREI7:
def_or_undef (parse_in, "__tune_corei7__");
break;
+ case PROCESSOR_HASWELL:
+ def_or_undef (parse_in, "__tune_core_avx2__");
+ break;
case PROCESSOR_ATOM:
def_or_undef (parse_in, "__tune_atom__");
break;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 69f44aa..cdb5d23 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1732,7 +1732,8 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)
#define m_COREI7 (1<<PROCESSOR_COREI7)
-#define m_CORE2I7 (m_CORE2 | m_COREI7)
+#define m_HASWELL (1<<PROCESSOR_HASWELL)
+#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
#define m_ATOM (1<<PROCESSOR_ATOM)
#define m_GEODE (1<<PROCESSOR_GEODE)
@@ -1768,16 +1769,16 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
negatively, so enabling for Generic64 seems like good code size
tradeoff. We can't enable it for 32bit generic because it does not
work well with PPro base chips. */
- m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
+ m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
/* X86_TUNE_PUSH_MEMORY */
- m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_ZERO_EXTEND_WITH_AND */
m_486 | m_PENT,
/* X86_TUNE_UNROLL_STRLEN */
- m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
+ m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
on simulation result. But after P4 was made, no performance benefit
@@ -1789,11 +1790,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
~m_386,
/* X86_TUNE_USE_SAHF */
- m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies. */
- m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
register stalls on Generic32 compilation setting as well. However
@@ -1806,17 +1807,17 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_PPRO,
/* X86_TUNE_PARTIAL_FLAG_REG_STALL */
- m_CORE2I7 | m_GENERIC,
+ m_CORE_ALL | m_GENERIC,
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
* on 16-bit immediate moves into memory on Core2 and Corei7. */
- m_CORE2I7 | m_GENERIC,
+ m_CORE_ALL | m_GENERIC,
/* X86_TUNE_USE_HIMODE_FIOP */
m_386 | m_486 | m_K6_GEODE,
/* X86_TUNE_USE_SIMODE_FIOP */
- ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
+ ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
/* X86_TUNE_USE_MOV0 */
m_K6,
@@ -1837,7 +1838,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
~(m_PENT | m_PPRO),
/* X86_TUNE_PROMOTE_QIMODE */
- m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_FAST_PREFIX */
~(m_386 | m_486 | m_PENT),
@@ -1878,10 +1879,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
- ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
+ ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
/* X86_TUNE_PARTIAL_REG_DEPENDENCY */
- m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
+ m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
conflict here in between PPro/Pentium4 based chips that thread 128bit
@@ -1892,7 +1893,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
shows that disabling this option on P4 brings over 20% SPECfp regression,
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
- m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
@@ -1916,7 +1917,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_PPRO | m_P4_NOCONA,
/* X86_TUNE_MEMORY_MISMATCH_STALL */
- m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
+ m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_PROLOGUE_USING_MOVE */
m_PPRO | m_ATHLON_K8,
@@ -1938,28 +1939,28 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
- m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_SCHEDULE */
- m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_USE_BT */
- m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
+ m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_USE_INCDEC */
- ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
+ ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
/* X86_TUNE_PAD_RETURNS */
- m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
+ m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */
m_ATOM,
/* X86_TUNE_EXT_80387_CONSTANTS */
- m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
/* X86_TUNE_AVOID_VECTOR_DECODE */
- m_CORE2I7 | m_K8 | m_GENERIC64,
+ m_CORE_ALL | m_K8 | m_GENERIC64,
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
and SImode multiply, but 386 and 486 do HImode multiply faster. */
@@ -1967,11 +1968,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
vector path on AMD machines. */
- m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+ m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
machines. */
- m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+ m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
than a MOV. */
@@ -1988,7 +1989,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
from FP to FP. */
- m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
+ m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
from integer to FP. */
@@ -2026,7 +2027,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
regs instead of memory. */
- m_COREI7 | m_CORE2I7
+ m_CORE_ALL
};
/* Feature tests against the various architecture variations. */
@@ -2052,10 +2053,10 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
};
static const unsigned int x86_accumulate_outgoing_args
- = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
+ = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
static const unsigned int x86_arch_always_fancy_math_387
- = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
+ = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
static const unsigned int x86_avx256_split_unaligned_load
= m_COREI7 | m_GENERIC;
@@ -2436,6 +2437,8 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
{&core_cost, 16, 10, 16, 10, 16},
/* Core i7 */
{&core_cost, 16, 10, 16, 10, 16},
+ /* Core avx2 */
+ {&core_cost, 16, 10, 16, 10, 16},
{&generic32_cost, 16, 7, 16, 7, 16},
{&generic64_cost, 16, 10, 16, 10, 16},
{&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -2463,6 +2466,7 @@ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
"nocona",
"core2",
"corei7",
+ "core-avx2",
"atom",
"geode",
"k6",
@@ -2914,7 +2918,7 @@ ix86_option_override_internal (bool main_args_p)
| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
| PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
- {"core-avx2", PROCESSOR_COREI7, CPU_COREI7,
+ {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
@@ -24061,6 +24065,7 @@ ix86_issue_rate (void)
case PROCESSOR_PENTIUM4:
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_HASWELL:
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
@@ -24317,6 +24322,7 @@ ia32_multipass_dfa_lookahead (void)
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_HASWELL:
case PROCESSOR_ATOM:
/* Generally, we want haifa-sched:max_issue() to look ahead as far
as many instructions can be executed on a cycle, i.e.,
@@ -24861,6 +24867,7 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
{
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_HASWELL:
/* Do not perform multipass scheduling for pre-reload schedule
to save compile time. */
if (reload_completed)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3ac3451..ee21c47 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -248,6 +248,7 @@ extern const struct processor_costs ix86_size_cost;
#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
+#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
@@ -603,6 +604,7 @@ enum target_cpu_default
TARGET_CPU_DEFAULT_nocona,
TARGET_CPU_DEFAULT_core2,
TARGET_CPU_DEFAULT_corei7,
+ TARGET_CPU_DEFAULT_haswell,
TARGET_CPU_DEFAULT_atom,
TARGET_CPU_DEFAULT_geode,
@@ -2095,6 +2097,7 @@ enum processor_type
PROCESSOR_NOCONA,
PROCESSOR_CORE2,
PROCESSOR_COREI7,
+ PROCESSOR_HASWELL,
PROCESSOR_GENERIC32,
PROCESSOR_GENERIC64,
PROCESSOR_AMDFAM10,
next prev parent reply other threads:[~2013-01-11 12:15 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-12-28 13:36 Uros Bizjak
2012-12-29 5:26 ` Vladimir Yakovlev
2012-12-29 10:07 ` Uros Bizjak
[not found] ` <CAK1BsWpUdUg+ivi7pFdbUr8R45YjhbBCNhmN=98sMmW99dy-tg@mail.gmail.com>
2012-12-29 16:57 ` Vladimir Yakovlev
2012-12-30 13:21 ` Uros Bizjak
2012-12-30 16:05 ` Vladimir Yakovlev
2012-12-30 18:05 ` Uros Bizjak
2013-01-10 11:12 ` Vladimir Yakovlev
2013-01-10 11:28 ` Uros Bizjak
2013-01-10 11:31 ` Jakub Jelinek
2013-01-11 11:25 ` Vladimir Yakovlev
2013-01-11 11:27 ` Jakub Jelinek
2013-01-11 12:15 ` Vladimir Yakovlev [this message]
2013-01-11 12:21 ` Uros Bizjak
2013-01-11 12:38 ` Vladimir Yakovlev
2013-01-15 10:08 ` Kirill Yukhin
2012-12-30 11:59 ` Uros Bizjak
-- strict thread matches above, loose matches on Subject: below --
2012-12-27 17:07 Vladimir Yakovlev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAK1BsWo+gd6jXwtuNgHVCAWw07EsjiXSVgjUF66j68cAW2Sj8A@mail.gmail.com \
--to=vbyakovl23@gmail.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=jakub@redhat.com \
--cc=ubizjak@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).