public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-8006] x86: Update memcpy/memset inline strategies for Skylake family CPUs
@ 2021-04-06 12:41 H.J. Lu
0 siblings, 0 replies; only message in thread
From: H.J. Lu @ 2021-04-06 12:41 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:a32452a5442cd05040af53787af0d8b537ac77a6
commit r11-8006-ga32452a5442cd05040af53787af0d8b537ac77a6
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Mar 11 16:56:26 2021 -0800
x86: Update memcpy/memset inline strategies for Skylake family CPUs
Simplify memcpy and memset inline strategies to avoid branches for
Skylake family CPUs:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
2. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with simple code sequence if the data size
is a constant.
b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.
On Cascadelake processor with -march=native -Ofast -flto,
1. Performance impacts of SPEC CPU 2017 rate are:
500.perlbench_r 0.17%
502.gcc_r -0.36%
505.mcf_r 0.00%
520.omnetpp_r 0.08%
523.xalancbmk_r -0.62%
525.x264_r 1.04%
531.deepsjeng_r 0.11%
541.leela_r -1.09%
548.exchange2_r -0.25%
557.xz_r 0.17%
Geomean -0.08%
503.bwaves_r 0.00%
507.cactuBSSN_r 0.69%
508.namd_r -0.07%
510.parest_r 1.12%
511.povray_r 1.82%
519.lbm_r 0.00%
521.wrf_r -1.32%
526.blender_r -0.47%
527.cam4_r 0.23%
538.imagick_r -1.72%
544.nab_r -0.56%
549.fotonik3d_r 0.12%
554.roms_r 0.43%
Geomean 0.02%
2. Significant impacts on eembc benchmarks are:
eembc/idctrn01 9.23%
eembc/nnet_test 29.26%
gcc/
* config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
(skylake_memset): Likewise.
(skylake_cost): Change CLEAR_RATIO to 17.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.
gcc/testsuite/
* gcc.target/i386/memcpy-strategy-9.c: New test.
* gcc.target/i386/memcpy-strategy-10.c: Likewise.
* gcc.target/i386/memcpy-strategy-11.c: Likewise.
* gcc.target/i386/memset-strategy-7.c: Likewise.
* gcc.target/i386/memset-strategy-8.c: Likewise.
* gcc.target/i386/memset-strategy-9.c: Likewise.
Diff:
---
gcc/config/i386/x86-tune-costs.h | 27 ++++++++++++++--------
gcc/config/i386/x86-tune.def | 3 +--
gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c | 11 +++++++++
gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c | 18 +++++++++++++++
gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c | 9 ++++++++
gcc/testsuite/gcc.target/i386/memset-strategy-7.c | 11 +++++++++
gcc/testsuite/gcc.target/i386/memset-strategy-8.c | 9 ++++++++
gcc/testsuite/gcc.target/i386/memset-strategy-9.c | 17 ++++++++++++++
8 files changed, 93 insertions(+), 12 deletions(-)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 0e00ff99df3..ffe810f2bcb 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
static stringop_algs skylake_memcpy[2] = {
- {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
- {libcall, {{16, loop, false}, {512, unrolled_loop, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static stringop_algs skylake_memset[2] = {
- {libcall, {{6, loop_1_byte, true},
- {24, loop, true},
- {8192, rep_prefix_4_byte, true},
- {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {512, unrolled_loop, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static const
struct processor_costs skylake_cost = {
@@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (0), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
- 6, /* CLEAR_RATIO */
+ 17, /* CLEAR_RATIO */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 134916cc972..eb057a67750 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
move/set sequences of bytes with known size. */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
"prefer_known_rep_movsb_stosb",
- m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+ m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
compact prologues and epilogues by issuing a misaligned moves. This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
new file mode 100644
index 00000000000..970aa741971
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
new file mode 100644
index 00000000000..b6041944630
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+ e_u8 b[4][MAXBC];
+ int i, j;
+
+ for(i = 0; i < 4; i++)
+ for(j = 0; j < BC; j++) a[i][j] = b[i][j];
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
new file mode 100644
index 00000000000..b0dc7484d09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
new file mode 100644
index 00000000000..07c2816910c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
new file mode 100644
index 00000000000..52ea882c814
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
new file mode 100644
index 00000000000..d4db031958f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+ int i, j;
+
+ for(i = 0; i < 4; i++)
+ for(j = 0; j < BC; j++) a[i][j] = 1;
+}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2021-04-06 12:41 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-06 12:41 [gcc r11-8006] x86: Update memcpy/memset inline strategies for Skylake family CPUs H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).