public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Erich Elsen <eriche@google.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: "Carlos O'Donell" <carlos@redhat.com>,
	GNU C Library <libc-alpha@sourceware.org>
Subject: Re: memcpy performance regressions 2.19 -> 2.24(5)
Date: Tue, 23 May 2017 03:19:00 -0000	[thread overview]
Message-ID: <CAOVZoAM8iAhH__mtm+HUZb8N6OXk1P=9QKgFeewtmStmXzVSMg@mail.gmail.com> (raw)
In-Reply-To: <CAMe9rOowieroCST=wq81Co+OHud81jyjJZ8OZpqQCmEnnoPZfQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

Here is the patch that slightly refactors how init_cacheinfo is called.

On Mon, May 22, 2017 at 7:24 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, May 22, 2017 at 6:23 PM, Erich Elsen <eriche@google.com> wrote:
>> I definitely think increasing the size in the case of processors with
>> a large number of cores makes sense.  Hopefully with some testing we
>> can confirm it is a net win and/or find a more empirical number.
>>
>> Thanks for that patch with the tunable support.  I've just put a
>> similar patch in review for sharing right now.  It adds support for the
>> case where HAVE_TUNABLES isn't defined, like the similar code in arena.c,
>> and also makes a minor change that turns init_cacheinfo into an
>> init_cacheinfo_impl (a hidden callable).  init_cacheinfo is now a
>> constructor that just calls the impl and passes the cpu_features
>> struct.  This is useful in that it makes the code a bit more modular
>> (something that we'll need in order to test this internally).
>
> This sounds like a good idea.  I'd also like to add tunable support in
> init_cpu_features to turn on/off CPU features.   non_temporal_threshold
> will be one of them.
>
>
> --
> H.J.

[-- Attachment #2: 0001-add-tunable-for-non-temporal-store.-slightly-refacto.patch --]
[-- Type: text/x-patch, Size: 7054 bytes --]

From 87b133a3df55e4e444f893a354f01e10e7557ac6 Mon Sep 17 00:00:00 2001
From: Erich Elsen <eriche@google.com>
Date: Mon, 22 May 2017 18:08:58 -0700
Subject: [PATCH 1/2] add tunable for non temporal store. slightly refactor
 cache info code to allow for the possibility of calling the implementation.

---
 elf/dl-tunables.list    |  7 ++++
 sysdeps/x86/cacheinfo.c | 95 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488798..d19fb0f175 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -30,6 +30,13 @@
 # 	     NONE: Read all the time.
 
 glibc {
+  x86_cache {
+    x86_shared_non_temporal_threshold {
+      type: SIZE_T
+      env_alias: SHARED_NON_TEMPORAL_THRESHOLD
+      security_level: SXID_IGNORE
+    }
+  }
   malloc {
     check {
       type: INT_32
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 1ccbe41b8f..2619c5a83c 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,15 @@
 #include <cpuid.h>
 #include <init-arch.h>
 
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+#else
+  #include <string.h>
+  extern char **_environ;
+#endif
+#include <elf/dl-tunables.h>
+
+
 #define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
 #define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
 #define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -128,7 +137,7 @@ intel_02_known_compare (const void *p1, const void *p2)
 static long int
 __attribute__ ((noinline))
 intel_check_word (int name, unsigned int value, bool *has_level_2,
-		  bool *no_level_2_or_3)
+		  bool *no_level_2_or_3, const struct cpu_features* x86_cpu_features)
 {
   if ((value & 0x80000000) != 0)
     /* The register value is reserved.  */
@@ -206,8 +215,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      /* Intel reused this value.  For family 15, model 6 it
 		 specifies the 3rd level cache.  Otherwise the 2nd
 		 level cache.  */
-	      unsigned int family = GLRO(dl_x86_cpu_features).family;
-	      unsigned int model = GLRO(dl_x86_cpu_features).model;
+	      unsigned int family = x86_cpu_features->family;
+	      unsigned int model = x86_cpu_features->model;
 
 	      if (family == 15 && model == 6)
 		{
@@ -257,7 +266,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 
 
 static long int __attribute__ ((noinline))
-handle_intel (int name, unsigned int maxidx)
+handle_intel (int name, unsigned int maxidx,
+              const struct cpu_features* x86_cpu_features)
 {
   /* Return -1 for older CPUs.  */
   if (maxidx < 2)
@@ -289,19 +299,23 @@ handle_intel (int name, unsigned int maxidx)
 	}
 
       /* Process the individual registers' value.  */
-      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
     }
@@ -437,7 +451,7 @@ attribute_hidden
 __cache_sysconf (int name)
 {
   if (is_intel)
-    return handle_intel (name, max_cpuid);
+    return handle_intel (name, max_cpuid, &GLRO(dl_x86_cpu_features));
 
   if (is_amd)
     return handle_amd (name);
@@ -475,9 +489,9 @@ int __x86_prefetchw attribute_hidden;
 #endif
 
 
-static void
-__attribute__((constructor))
-init_cacheinfo (void)
+void
+attribute_hidden
+__init_cacheinfo_impl (const struct cpu_features* x86_cpu_features)
 {
   /* Find out what brand of processor.  */
   unsigned int eax;
@@ -492,14 +506,17 @@ init_cacheinfo (void)
 
   if (is_intel)
     {
-      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid,
+                           x86_cpu_features);
 
-      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid,
+                                    x86_cpu_features);
       bool inclusive_cache = true;
 
       /* Try L3 first.  */
       level  = 3;
-      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
+      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid,
+                             x86_cpu_features);
 
       /* Number of logical processors sharing L2 cache.  */
       int threads_l2;
@@ -529,8 +546,8 @@ init_cacheinfo (void)
 	     highest cache level.  */
 	  if (max_cpuid >= 4)
 	    {
-	      unsigned int family = GLRO(dl_x86_cpu_features).family;
-	      unsigned int model = GLRO(dl_x86_cpu_features).model;
+				unsigned int family = x86_cpu_features->family;
+				unsigned int model = x86_cpu_features->model;
 
 	      int i = 0;
 
@@ -673,7 +690,7 @@ intel_bug_no_cache_info:
 		 level.  */
 
 	      threads
-		= ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+		= ((x86_cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
 		    >> 16) & 0xff);
 	    }
 
@@ -768,4 +785,46 @@ intel_bug_no_cache_info:
      shared cache size is the approximate value above which non-temporal
      store becomes faster.  */
   __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+
+#if HAVE_TUNABLES
+  TUNABLE_SET_VAL(x86_shared_non_temporal_threshold,
+                  &__x86_shared_non_temporal_threshold);
+#else
+  if (__glibc_likely (_environ != NULL))
+    {
+      char **runp = _environ;
+      char *envline;
+
+      while (*runp != NULL)
+        {
+          envline = *runp;
+          runp++;
+          size_t len = strcspn (envline, "=");
+
+          if (envline[len] != '=')
+            continue;
+
+          switch (len)
+            {
+            case 29:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline,
+                              "SHARED_NON_TEMPORAL_THRESHOLD", 29) == 0)
+                    __x86_shared_non_temporal_threshold = atoi (&envline[29]);
+                }
+              break;
+            default:
+              break;
+            }
+        }
+    }
+#endif
+}
+
+static void
+__attribute__((constructor))
+init_cacheinfo (void)
+{
+  __init_cacheinfo_impl (&GLRO(dl_x86_cpu_features));
 }
-- 
2.13.0.219.gdb65acc882-goog


  reply	other threads:[~2017-05-23  3:19 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-05 17:09 Erich Elsen
2017-05-05 18:09 ` Carlos O'Donell
2017-05-06  0:57   ` Erich Elsen
2017-05-06 15:41     ` H.J. Lu
2017-05-09 23:48       ` Erich Elsen
2017-05-10 17:33         ` H.J. Lu
2017-05-11  2:17           ` Carlos O'Donell
2017-05-12 19:47             ` Erich Elsen
     [not found]             ` <CAOVZoAPp3_T+ourRkNFXHfCSQUOMFn4iBBm9j50==h=VJcGSzw@mail.gmail.com>
2017-05-12 20:21               ` H.J. Lu
2017-05-12 21:21                 ` H.J. Lu
2017-05-18 20:59                   ` Erich Elsen
2017-05-22 19:17                     ` H.J. Lu
2017-05-22 20:22                       ` H.J. Lu
2017-05-23  1:23                       ` Erich Elsen
2017-05-23  2:25                         ` H.J. Lu
2017-05-23  3:19                           ` Erich Elsen [this message]
2017-05-23 20:39                             ` Erich Elsen
2017-05-23 20:46                               ` H.J. Lu
2017-05-23 20:57                                 ` Erich Elsen
2017-05-23 22:08                                   ` H.J. Lu
2017-05-23 22:12                                     ` Erich Elsen
2017-05-23 22:55                                       ` H.J. Lu
2017-05-24  0:56                                         ` Erich Elsen
2017-05-24  3:42                                           ` H.J. Lu
2017-05-24 21:03                                             ` Erich Elsen
2017-05-24 21:36                             ` H.J. Lu
2017-05-25 21:23                               ` Erich Elsen
2017-05-25 21:57                                 ` Erich Elsen
2017-05-25 22:03                                   ` H.J. Lu
2017-05-27  0:31                                     ` Erich Elsen
2017-05-27 21:35                                       ` H.J. Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAOVZoAM8iAhH__mtm+HUZb8N6OXk1P=9QKgFeewtmStmXzVSMg@mail.gmail.com' \
    --to=eriche@google.com \
    --cc=carlos@redhat.com \
    --cc=hjl.tools@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).