public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-8006] x86: Update memcpy/memset inline strategies for Skylake family CPUs
@ 2021-04-06 12:41 H.J. Lu
  0 siblings, 0 replies; only message in thread
From: H.J. Lu @ 2021-04-06 12:41 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:a32452a5442cd05040af53787af0d8b537ac77a6

commit r11-8006-ga32452a5442cd05040af53787af0d8b537ac77a6
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 11 16:56:26 2021 -0800

    x86: Update memcpy/memset inline strategies for Skylake family CPUs
    
    Simplify memcpy and memset inline strategies to avoid branches for
    Skylake family CPUs:
    
    1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
       load and store for up to 16 * 16 (256) bytes when the data size is
       fixed and known.
    2. Inline only if data size is known to be <= 256.
       a. Use "rep movsb/stosb" with simple code sequence if the data size
          is a constant.
       b. Use loop if data size is not a constant.
    3. Use memcpy/memset library function if data size is unknown or > 256.
    
    On Cascadelake processor with -march=native -Ofast -flto,
    
    1. Performance impacts of SPEC CPU 2017 rate are:
    
    500.perlbench_r  0.17%
    502.gcc_r       -0.36%
    505.mcf_r        0.00%
    520.omnetpp_r    0.08%
    523.xalancbmk_r -0.62%
    525.x264_r       1.04%
    531.deepsjeng_r  0.11%
    541.leela_r     -1.09%
    548.exchange2_r -0.25%
    557.xz_r         0.17%
    Geomean         -0.08%
    
    503.bwaves_r     0.00%
    507.cactuBSSN_r  0.69%
    508.namd_r      -0.07%
    510.parest_r     1.12%
    511.povray_r     1.82%
    519.lbm_r        0.00%
    521.wrf_r       -1.32%
    526.blender_r   -0.47%
    527.cam4_r       0.23%
    538.imagick_r   -1.72%
    544.nab_r       -0.56%
    549.fotonik3d_r  0.12%
    554.roms_r       0.43%
    Geomean          0.02%
    
    2. Significant impacts on eembc benchmarks are:
    
    eembc/idctrn01   9.23%
    eembc/nnet_test  29.26%
    
    gcc/
    
            * config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
            (skylake_memset): Likewise.
            (skylake_cost): Change CLEAR_RATIO to 17.
            * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
            Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
            m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.
    
    gcc/testsuite/
    
            * gcc.target/i386/memcpy-strategy-9.c: New test.
            * gcc.target/i386/memcpy-strategy-10.c: Likewise.
            * gcc.target/i386/memcpy-strategy-11.c: Likewise.
            * gcc.target/i386/memset-strategy-7.c: Likewise.
            * gcc.target/i386/memset-strategy-8.c: Likewise.
            * gcc.target/i386/memset-strategy-9.c: Likewise.

Diff:
---
 gcc/config/i386/x86-tune-costs.h                   | 27 ++++++++++++++--------
 gcc/config/i386/x86-tune.def                       |  3 +--
 gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c | 11 +++++++++
 gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c | 18 +++++++++++++++
 gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c  |  9 ++++++++
 gcc/testsuite/gcc.target/i386/memset-strategy-7.c  | 11 +++++++++
 gcc/testsuite/gcc.target/i386/memset-strategy-8.c  |  9 ++++++++
 gcc/testsuite/gcc.target/i386/memset-strategy-9.c  | 17 ++++++++++++++
 8 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 0e00ff99df3..ffe810f2bcb 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
 
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
 static stringop_algs skylake_memcpy[2] =   {
-  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
-  {libcall, {{16, loop, false}, {512, unrolled_loop, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 
 static stringop_algs skylake_memset[2] = {
-  {libcall, {{6, loop_1_byte, true},
-             {24, loop, true},
-             {8192, rep_prefix_4_byte, true},
-             {-1, libcall, false}}},
-  {libcall, {{24, loop, true}, {512, unrolled_loop, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 
 static const
 struct processor_costs skylake_cost = {
@@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (0),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
+  17,					/* CLEAR_RATIO */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 134916cc972..eb057a67750 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 	  "prefer_known_rep_movsb_stosb",
-	  m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
-	  | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
new file mode 100644
index 00000000000..970aa741971
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
new file mode 100644
index 00000000000..b6041944630
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+  e_u8 b[4][MAXBC];
+  int i, j;
+
+  for(i = 0; i < 4; i++)
+    for(j = 0; j < BC; j++) a[i][j] = b[i][j];
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
new file mode 100644
index 00000000000..b0dc7484d09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
new file mode 100644
index 00000000000..07c2816910c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
new file mode 100644
index 00000000000..52ea882c814
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
new file mode 100644
index 00000000000..d4db031958f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+  int i, j;
+
+  for(i = 0; i < 4; i++)
+    for(j = 0; j < BC; j++) a[i][j] = 1;
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-04-06 12:41 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-06 12:41 [gcc r11-8006] x86: Update memcpy/memset inline strategies for Skylake family CPUs H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).