public inbox for libc-stable@sourceware.org
* [COMMITTED 2.39 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
@ 2024-04-04 10:39 Arjun Shankar
  2024-04-04 10:39 ` [COMMITTED 2.39 2/3] x86: Do not prefer ERMS for memset on Zen3+ Arjun Shankar
  2024-04-04 10:39 ` [COMMITTED 2.39 3/3] x86: Expand the comment on when REP STOSB is used on memset Arjun Shankar
  0 siblings, 2 replies; 3+ messages in thread
From: Arjun Shankar @ 2024-04-04 10:39 UTC (permalink / raw)
  To: libc-stable; +Cc: Arjun Shankar, Adhemerval Zanella, H . J . Lu

From: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Using REP MOVSB for memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops.  Also,
as reported in BZ 30994, if the source is aligned and the destination
is not, performance can be as much as 20x slower.

The performance difference is most noticeable at small buffer sizes,
close to the lower bound at which memcpy/memmove starts to use ERMS.
At the upper size limit (the L2 cache size), the performance of REP
MOVSB is similar to that of the vectorized instructions.  Also, there
is no drawback when multiple cores share the cache.
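
For illustration, the alignment effect can be reproduced with a minimal
micro-benchmark along the following lines (a sketch only, not the
harness used for this commit; buffer size and iteration count are
arbitrary, and whether the ERMS path is entered at all depends on the
configured thresholds):

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <time.h>

  /* Illustrative values only.  */
  #define N      (512 * 1024)
  #define ITERS  1000

  static double
  bench (char *dst, const char *src, size_t n)
  {
    struct timespec t0, t1;
    clock_gettime (CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < ITERS; i++)
      memcpy (dst, src, n);
    clock_gettime (CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  }

  int
  main (void)
  {
    /* Over-allocate so the destination can be misaligned by hand.  */
    char *src = aligned_alloc (64, N + 64);
    char *dst = aligned_alloc (64, N + 64);
    if (src == NULL || dst == NULL)
      return 1;
    memset (src, 1, N + 64);

    double aligned = bench (dst, src, N);
    double misaligned = bench (dst + 1, src, N);  /* The BZ 30994 case.  */

    printf ("aligned: %.0f ns  misaligned dst: %.0f ns  ratio: %.2fx\n",
            aligned, misaligned, misaligned / aligned);
    free (src);
    free (dst);
    return 0;
  }

On an affected Zen3/Zen4 core the misaligned-destination ratio can be
large when the copy size lands in the ERMS window; elsewhere it should
stay close to 1.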

Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e)
---
 sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d5101615e3..f34d12846c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   long int data = -1;
   long int shared = -1;
   long int shared_per_thread = -1;
-  long int core = -1;
   unsigned int threads = 0;
   unsigned long int level1_icache_size = -1;
   unsigned long int level1_icache_linesize = -1;
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (cpu_features->basic.kind == arch_kind_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
-      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
       shared_per_thread = shared;
 
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
       level1_dcache_linesize
 	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
-      level2_cache_size = core;
+      level2_cache_size
+	= handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       level2_cache_assoc
 	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
       level2_cache_linesize
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level4_cache_size
 	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+			     level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
     {
       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
       shared_per_thread = shared;
 
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+			     level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_amd)
     {
       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       if (shared <= 0)
         {
            /* No shared L3 cache.  All we have is the L2 cache.  */
-           shared = core;
+           shared = level2_cache_size;
         }
       else if (cpu_features->basic.family < 0x17)
         {
            /* Account for exclusive L2 and L3 caches.  */
-           shared += core;
+           shared += level2_cache_size;
         }
 
       shared_per_thread = shared;
@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
+  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in many cases
+     slower than the vectorized path (and for some alignments it is
+     dramatically slower; see BZ #30994).  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_threshold = non_temporal_threshold;
+
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
 
@@ -1028,16 +1033,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 			   SIZE_MAX);
 
   unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
   /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
+     non-temporal threshold for all architectures.  */
+  rep_movsb_stop_threshold = non_temporal_threshold;
 
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
-- 
2.43.0
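
The net effect of the hunks above: REP MOVSB is now selected only for
copy sizes in the window [rep_movsb_threshold,
rep_movsb_stop_threshold), and on AMD both bounds equal the computed
non-temporal threshold, so the window is empty.  A sketch of that
window logic (illustrative only; the threshold value is made up and
this is not glibc's actual dispatch code):

  #include <stdbool.h>
  #include <stdio.h>

  static bool
  selects_rep_movsb (size_t n, size_t rep_movsb_threshold,
                     size_t rep_movsb_stop_threshold)
  {
    return n >= rep_movsb_threshold && n < rep_movsb_stop_threshold;
  }

  int
  main (void)
  {
    size_t non_temporal_threshold = 0x80000;  /* Made-up example value.  */

    /* After this patch, both bounds coincide on AMD, so no size is ever
       inside the window.  */
    printf ("%d\n", selects_rep_movsb (0x40000, non_temporal_threshold,
                                       non_temporal_threshold));  /* 0 */
    return 0;
  }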



* [COMMITTED 2.39 2/3] x86: Do not prefer ERMS for memset on Zen3+
  2024-04-04 10:39 [COMMITTED 2.39 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) Arjun Shankar
@ 2024-04-04 10:39 ` Arjun Shankar
  2024-04-04 10:39 ` [COMMITTED 2.39 3/3] x86: Expand the comment on when REP STOSB is used on memset Arjun Shankar
  1 sibling, 0 replies; 3+ messages in thread
From: Arjun Shankar @ 2024-04-04 10:39 UTC (permalink / raw)
  To: libc-stable; +Cc: Arjun Shankar, Adhemerval Zanella, H . J . Lu

From: Adhemerval Zanella <adhemerval.zanella@linaro.org>

For AMD Zen3+ architectures, the performance of the vectorized loop is
slightly better than that of ERMS.
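
Note that the new default applies only when the tunable has not been
set explicitly (the TUNABLE_IS_INITIALIZED check in the hunk below),
so REP STOSB can still be requested via the
glibc.cpu.x86_rep_stosb_threshold tunable.  As a sketch of what a
SIZE_MAX threshold achieves (illustrative only, not glibc's actual
dispatch code; 2048 is the generic default from dl-cacheinfo.h):

  #include <stdint.h>
  #include <stdio.h>

  /* With the threshold at SIZE_MAX no request can reach it, so the
     REP STOSB branch becomes unreachable.  */
  static const char *
  memset_path (size_t n, size_t rep_stosb_threshold)
  {
    return n >= rep_stosb_threshold ? "REP STOSB" : "vectorized loop";
  }

  int
  main (void)
  {
    printf ("%s\n", memset_path (1 << 20, 2048));      /* Generic default.  */
    printf ("%s\n", memset_path (1 << 20, SIZE_MAX));  /* Zen3+ default.  */
    return 0;
  }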

Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 272708884cb750f12f5c74a00e6620c19dc6d567)
---
 sysdeps/x86/dl-cacheinfo.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f34d12846c..5a98f70364 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
      minimum value is fixed.  */
   rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
 				     long int, NULL);
+  if (cpu_features->basic.kind == arch_kind_amd
+      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
+    /* For AMD Zen3+ architectures, the performance of the vectorized
+       loop is slightly better than that of ERMS.  */
+    rep_stosb_threshold = SIZE_MAX;
 
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
-- 
2.43.0



* [COMMITTED 2.39 3/3] x86: Expand the comment on when REP STOSB is used on memset
  2024-04-04 10:39 [COMMITTED 2.39 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) Arjun Shankar
  2024-04-04 10:39 ` [COMMITTED 2.39 2/3] x86: Do not prefer ERMS for memset on Zen3+ Arjun Shankar
@ 2024-04-04 10:39 ` Arjun Shankar
  1 sibling, 0 replies; 3+ messages in thread
From: Arjun Shankar @ 2024-04-04 10:39 UTC (permalink / raw)
  To: libc-stable; +Cc: Arjun Shankar, Adhemerval Zanella, H . J . Lu

From: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 491e55beab7457ed310a4a47496f4a333c5d1032)
---
 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9984c3ca0f..97839a2248 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,7 +21,9 @@
    2. If size is less than VEC, use integer register stores.
    3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
    4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+   5. On machines with the ERMS feature, if the size is greater than or
+      equal to __x86_rep_stosb_threshold, then REP STOSB will be used.
+   6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
       4 VEC stores and store 4 * VEC at a time until done.  */
 
 #include <sysdep.h>
-- 
2.43.0
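
For reference, the decision ladder described by the updated comment can
be sketched in C roughly as follows (VEC_SIZE and the threshold value
are placeholders, the boundary conditions are approximate, and the real
implementation is the assembly above):

  #include <stdbool.h>
  #include <stdio.h>

  #define VEC_SIZE 32  /* Placeholder; depends on the selected variant.  */

  static const char *
  memset_strategy (size_t n, size_t rep_stosb_threshold, bool have_erms)
  {
    if (n < VEC_SIZE)
      return "integer register stores";                /* Step 2.  */
    if (n <= 2 * VEC_SIZE)
      return "2 VEC stores";                           /* Step 3.  */
    if (n <= 4 * VEC_SIZE)
      return "4 VEC stores";                           /* Step 4.  */
    if (have_erms && n >= rep_stosb_threshold)
      return "REP STOSB";                              /* Step 5.  */
    return "align, then 4 * VEC stores per iteration"; /* Step 6.  */
  }

  int
  main (void)
  {
    printf ("%s\n", memset_strategy (24, 2048, true));
    printf ("%s\n", memset_strategy (1 << 20, 2048, true));
    return 0;
  }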


