From: Erich Elsen <eriche@google.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: "Carlos O'Donell" <carlos@redhat.com>,
GNU C Library <libc-alpha@sourceware.org>
Subject: Re: memcpy performance regressions 2.19 -> 2.24(5)
Date: Tue, 23 May 2017 03:19:00 -0000 [thread overview]
Message-ID: <CAOVZoAM8iAhH__mtm+HUZb8N6OXk1P=9QKgFeewtmStmXzVSMg@mail.gmail.com> (raw)
In-Reply-To: <CAMe9rOowieroCST=wq81Co+OHud81jyjJZ8OZpqQCmEnnoPZfQ@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]
Here is the patch that slightly refactors how init_cacheinfo is called.
On Mon, May 22, 2017 at 7:24 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, May 22, 2017 at 6:23 PM, Erich Elsen <eriche@google.com> wrote:
>> I definitely think increasing the size in the case of processors with
>> a large number of cores makes sense. Hopefully with some testing we
>> can confirm it is a net win and/or find a more empirical number.
>>
>> Thanks for that patch with the tunable support. I've just put a
>> similar patch in review for sharing right now. It adds support in the
>> case that HAVE_TUNABLES isn't defined like the similar code in arena.c
>> and also makes a minor change that turns init_cacheinfo into an
>> init_cacheinfo_impl (a hidden callable). init_cacheinfo is now a
>> constructor that just calls the impl and passes the cpu_features
>> struct. This is useful in that it makes the code a bit more modular
>> (something that we'll need to be able to test this internally).
>
> This sounds like a good idea. I'd also like to add tunable support in
> init_cpu_features to turn on/off CPU features. non_temporal_threshold
> will be one of them.
>
>
> --
> H.J.
[-- Attachment #2: 0001-add-tunable-for-non-temporal-store.-slightly-refacto.patch --]
[-- Type: text/x-patch, Size: 7054 bytes --]
From 87b133a3df55e4e444f893a354f01e10e7557ac6 Mon Sep 17 00:00:00 2001
From: Erich Elsen <eriche@google.com>
Date: Mon, 22 May 2017 18:08:58 -0700
Subject: [PATCH 1/2] add tunable for non temporal store. slightly refactor
cache info code to allow for the possibility of calling the implementation.
---
elf/dl-tunables.list | 7 ++++
sysdeps/x86/cacheinfo.c | 95 +++++++++++++++++++++++++++++++++++++++----------
2 files changed, 84 insertions(+), 18 deletions(-)
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488798..d19fb0f175 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -30,6 +30,13 @@
# NONE: Read all the time.
glibc {
+ x86_cache {
+ x86_shared_non_temporal_threshold {
+ type: SIZE_T
+ env_alias: SHARED_NON_TEMPORAL_THRESHOLD
+ security_level: SXID_IGNORE
+ }
+ }
malloc {
check {
type: INT_32
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 1ccbe41b8f..2619c5a83c 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,15 @@
#include <cpuid.h>
#include <init-arch.h>
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+#else
+ #include <string.h>
+ extern char **_environ;
+#endif
+#include <elf/dl-tunables.h>
+
+
#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -128,7 +137,7 @@ intel_02_known_compare (const void *p1, const void *p2)
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
- bool *no_level_2_or_3)
+ bool *no_level_2_or_3, const struct cpu_features* x86_cpu_features)
{
if ((value & 0x80000000) != 0)
/* The register value is reserved. */
@@ -206,8 +215,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
/* Intel reused this value. For family 15, model 6 it
specifies the 3rd level cache. Otherwise the 2nd
level cache. */
- unsigned int family = GLRO(dl_x86_cpu_features).family;
- unsigned int model = GLRO(dl_x86_cpu_features).model;
+ unsigned int family = x86_cpu_features->family;
+ unsigned int model = x86_cpu_features->model;
if (family == 15 && model == 6)
{
@@ -257,7 +266,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
static long int __attribute__ ((noinline))
-handle_intel (int name, unsigned int maxidx)
+handle_intel (int name, unsigned int maxidx,
+ const struct cpu_features* x86_cpu_features)
{
/* Return -1 for older CPUs. */
if (maxidx < 2)
@@ -289,19 +299,23 @@ handle_intel (int name, unsigned int maxidx)
}
/* Process the individual registers' value. */
- result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
}
@@ -437,7 +451,7 @@ attribute_hidden
__cache_sysconf (int name)
{
if (is_intel)
- return handle_intel (name, max_cpuid);
+ return handle_intel (name, max_cpuid, &GLRO(dl_x86_cpu_features));
if (is_amd)
return handle_amd (name);
@@ -475,9 +489,9 @@ int __x86_prefetchw attribute_hidden;
#endif
-static void
-__attribute__((constructor))
-init_cacheinfo (void)
+void
+attribute_hidden
+__init_cacheinfo_impl (const struct cpu_features* x86_cpu_features)
{
/* Find out what brand of processor. */
unsigned int eax;
@@ -492,14 +506,17 @@ init_cacheinfo (void)
if (is_intel)
{
- data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid,
+ x86_cpu_features);
- long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+ long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid,
+ x86_cpu_features);
bool inclusive_cache = true;
/* Try L3 first. */
level = 3;
- shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
+ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid,
+ x86_cpu_features);
/* Number of logical processors sharing L2 cache. */
int threads_l2;
@@ -529,8 +546,8 @@ init_cacheinfo (void)
highest cache level. */
if (max_cpuid >= 4)
{
- unsigned int family = GLRO(dl_x86_cpu_features).family;
- unsigned int model = GLRO(dl_x86_cpu_features).model;
+ unsigned int family = x86_cpu_features->family;
+ unsigned int model = x86_cpu_features->model;
int i = 0;
@@ -673,7 +690,7 @@ intel_bug_no_cache_info:
level. */
threads
- = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+ = ((x86_cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
>> 16) & 0xff);
}
@@ -768,4 +785,46 @@ intel_bug_no_cache_info:
shared cache size is the approximate value above which non-temporal
store becomes faster. */
__x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+ /* Allow overriding the threshold via tunable or environment variable. */
+#if HAVE_TUNABLES
+ TUNABLE_SET_VAL(x86_shared_non_temporal_threshold,
+ &__x86_shared_non_temporal_threshold);
+#else
+ if (__glibc_likely (_environ != NULL))
+ {
+ char **runp = _environ;
+ char *envline;
+ /* Walk every entry of the environment. */
+ while (*runp != NULL)
+ {
+ envline = *runp;
+ runp++;
+ size_t len = strcspn (envline, "=");
+ /* Entries without '=' carry no value; skip them. */
+ if (envline[len] != '=')
+ continue;
+ /* envline[len] is '='; the value starts at envline[len + 1]. */
+ switch (len)
+ {
+ case 29:
+ if (!__builtin_expect (__libc_enable_secure, 0))
+ {
+ if (memcmp (envline,
+ "SHARED_NON_TEMPORAL_THRESHOLD", 29) == 0)
+ __x86_shared_non_temporal_threshold = atoi (&envline[30]);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+#endif
+}
+
+static void
+__attribute__((constructor))
+init_cacheinfo (void)
+{
+ __init_cacheinfo_impl (&GLRO(dl_x86_cpu_features));
}
--
2.13.0.219.gdb65acc882-goog
next prev parent reply other threads:[~2017-05-23 3:19 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-05-05 17:09 Erich Elsen
2017-05-05 18:09 ` Carlos O'Donell
2017-05-06 0:57 ` Erich Elsen
2017-05-06 15:41 ` H.J. Lu
2017-05-09 23:48 ` Erich Elsen
2017-05-10 17:33 ` H.J. Lu
2017-05-11 2:17 ` Carlos O'Donell
2017-05-12 19:47 ` Erich Elsen
[not found] ` <CAOVZoAPp3_T+ourRkNFXHfCSQUOMFn4iBBm9j50==h=VJcGSzw@mail.gmail.com>
2017-05-12 20:21 ` H.J. Lu
2017-05-12 21:21 ` H.J. Lu
2017-05-18 20:59 ` Erich Elsen
2017-05-22 19:17 ` H.J. Lu
2017-05-22 20:22 ` H.J. Lu
2017-05-23 1:23 ` Erich Elsen
2017-05-23 2:25 ` H.J. Lu
2017-05-23 3:19 ` Erich Elsen [this message]
2017-05-23 20:39 ` Erich Elsen
2017-05-23 20:46 ` H.J. Lu
2017-05-23 20:57 ` Erich Elsen
2017-05-23 22:08 ` H.J. Lu
2017-05-23 22:12 ` Erich Elsen
2017-05-23 22:55 ` H.J. Lu
2017-05-24 0:56 ` Erich Elsen
2017-05-24 3:42 ` H.J. Lu
2017-05-24 21:03 ` Erich Elsen
2017-05-24 21:36 ` H.J. Lu
2017-05-25 21:23 ` Erich Elsen
2017-05-25 21:57 ` Erich Elsen
2017-05-25 22:03 ` H.J. Lu
2017-05-27 0:31 ` Erich Elsen
2017-05-27 21:35 ` H.J. Lu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAOVZoAM8iAhH__mtm+HUZb8N6OXk1P=9QKgFeewtmStmXzVSMg@mail.gmail.com' \
--to=eriche@google.com \
--cc=carlos@redhat.com \
--cc=hjl.tools@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).