public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold
@ 2019-07-26 11:58 Feng Xue OS
  2019-07-26 15:17 ` Szabolcs Nagy
  2019-07-29  3:46 ` Siddhesh Poyarekar
  0 siblings, 2 replies; 13+ messages in thread
From: Feng Xue OS @ 2019-07-26 11:58 UTC (permalink / raw)
  To: libc-alpha; +Cc: Feng Xue OS

This patch adds a tunable 'glibc.memset.dc_zva_threshold' that controls
whether memset uses DC ZVA. DC ZVA is used only when the memset size is
at least this threshold.

The background is that DC ZVA does not always outperform normal
memory-store zeroing, especially when there are multiple processes/threads
contending for memory/cache.

Feng
----

    * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
    * sysdeps/aarch64/dl-tunables.list (glibc):
    Add memset.dc_zva_threshold.
    * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
    dl-tunables.h
    (INIT_ZVA_THRESHOLD): New macro, with new local variable
    zva_threshold.
    (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
    * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
    New variable.
    (init_memset): New macro.
    * sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64)
    [HAVE_TUNABLES]: Add conditional compare over
    __memset_dc_zva_threshold.
    * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Changed
    to a new value.
---
 ChangeLog                                 | 18 ++++++++++++++++++
 manual/tunables.texi                      |  9 +++++++++
 sysdeps/aarch64/dl-tunables.list          |  6 ++++++
 sysdeps/aarch64/multiarch/init-arch.h     | 11 +++++++++++
 sysdeps/aarch64/multiarch/memset.c        | 22 +++++++++++++++++++---
 sysdeps/aarch64/multiarch/memset_base64.S |  7 +++++++
 sysdeps/aarch64/multiarch/memset_emag.S   |  4 ++--
 7 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index dbdb85d..1921e2a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2019-07-26  Feng Xue  <fxue@os.amperecomputing.com>
+
+	* manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
+	* sysdeps/aarch64/dl-tunables.list (glibc):
+	Add memset.dc_zva_threshold.
+	* sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
+	dl-tunables.h
+	(INIT_ZVA_THRESHOLD): New macro, with new local variable
+	zva_threshold.
+	(INIT_ARCH): Add INIT_ZVA_THRESHOLD.
+	* sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
+	New variable.
+	(init_memset): New macro.
+	* sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64)
+	[HAVE_TUNABLES]: Add conditional compare over __memset_dc_zva_threshold.
+	* sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Changed to
+	a new value.
+
 2019-07-25  Florian Weimer  <fweimer@redhat.com>
 
 	[BZ #24677]
diff --git a/manual/tunables.texi b/manual/tunables.texi
index ee0fdf2..c7c13cc 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -411,3 +411,12 @@ instead.
 
 This tunable is specific to i386 and x86-64.
 @end deftp
+
+@deftp Tunable glibc.memset.dc_zva_threshold
+The @code{glibc.memset.dc_zva_threshold} tunable allows the user to set the
+threshold for using DC ZVA in memset.  When the memset size is less than this
+threshold, normal store instructions are used; otherwise the DC ZVA
+instruction is used.  A value of zero selects the default threshold.
+
+This tunable is specific to aarch64.
+@end deftp
diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
index 5fac533..0f5b5e1 100644
--- a/sysdeps/aarch64/dl-tunables.list
+++ b/sysdeps/aarch64/dl-tunables.list
@@ -22,4 +22,10 @@ glibc {
       type: STRING
     }
   }
+  memset {
+    dc_zva_threshold {
+      type: SIZE_T
+      default: 0 
+    }
+  }
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index b9020ae..93133a2 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,7 +18,18 @@
 
 #include <ldsodefs.h>
 
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+
+# define INIT_ZVA_THRESHOLD()						      \
+  uint64_t __attribute__((unused)) zva_threshold =			      \
+    TUNABLE_GET(glibc, memset, dc_zva_threshold, size_t, NULL);
+#else
+# define INIT_ZVA_THRESHOLD() 
+#endif
+
 #define INIT_ARCH()							      \
+  INIT_ZVA_THRESHOLD()							      \
   uint64_t __attribute__((unused)) midr =				      \
     GLRO(dl_aarch64_cpu_features).midr_el1;				      \
   unsigned __attribute__((unused)) zva_size =				      \
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 4817587..2015bce 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -32,12 +32,28 @@ extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
+# if HAVE_TUNABLES
+uint64_t __memset_dc_zva_threshold = 512;
+
+#  define init_memset(fn, default_zva_threshold)		\
+({								\
+  if (zva_threshold)						\
+    __memset_dc_zva_threshold = zva_threshold;			\
+  else if (default_zva_threshold)				\
+    __memset_dc_zva_threshold = default_zva_threshold;		\
+  fn;								\
+})
+# else
+#  define init_memset(fn, default_zva_threshold)  (fn)
+# endif
+
 libc_ifunc (__libc_memset,
+
 	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
-	     ? __memset_falkor
+	     ? init_memset (__memset_falkor, 0)
 	     : (IS_EMAG (midr) && zva_size == 64
-	       ? __memset_emag
-	       : __memset_generic)));
+	       ? init_memset (__memset_emag, 8*1024*1024)
+	       : init_memset (__memset_generic, 0))));
 
 # undef memset
 strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..6350a6d 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -91,7 +91,14 @@ L(set96):
 	.p2align 4
 L(set_long):
 	stp	val, val, [dstin]
+#if HAVE_TUNABLES
+	adrp	tmp1, __memset_dc_zva_threshold
+	add	tmp1, tmp1, :lo12:__memset_dc_zva_threshold
+	ldr	tmp2, [tmp1]	/* Load DC ZVA tunable threshold value. */
+	cmp	count, tmp2
+#else
 	cmp	count, DC_ZVA_THRESHOLD
+#endif
 	ccmp	val, 0, 0, cs
 	bic	dst, dstin, 15
 	b.eq	L(zva_64)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..78a2a14 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -23,10 +23,10 @@
 /*
  * Using dc zva to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
+ * processes/threads contending memory/cache. Here we use a very 
  * large threshold to trigger usage of dc zva.
 */
-# define DC_ZVA_THRESHOLD 1024
+# define DC_ZVA_THRESHOLD (8*1024*1024)
 
 # include "./memset_base64.S"
 #endif
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 13+ messages in thread
* Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold
@ 2019-08-06 16:18 Wilco Dijkstra
  2019-08-07 14:13 ` Siddhesh Poyarekar
  0 siblings, 1 reply; 13+ messages in thread
From: Wilco Dijkstra @ 2019-08-06 16:18 UTC (permalink / raw)
  To: 'GNU C Library', fxue; +Cc: nd, Siddhesh Poyarekar

Hi Feng,

> I still hope this tuning on dc zva can work for other aarch64 processors.
> Since we focus on emag, and got no other aarch64 machines on hand,
> Then, if someone of other aarch64 is willing to test this, that would be better.

I don't believe this kind of tunable is useful in general. DC ZVA exists because
it gives a speedup - quite significantly so on the latest microarchitectures, but it
improves gcc_r performance as well on older cores like Cortex-A57.

If you find that it doesn't help emag, the best option is to avoid DC ZVA
altogether - this is even faster as you don't have to execute the runtime check.
Or you could use a tunable to select between fixed settings of the DC ZVA.

In fact it might be useful to have a generic tunable which allows one to choose
specific ifuncs, eg. glibc.memset=__memset_no_dczva.

        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
-       cmp     count, DC_ZVA_THRESHOLD
+#ifdef HAVE_DCZVA_THRESHOLD_TUNABLE
+       adrp    tmp1, __dczva_threshold
+       add     tmp1, tmp1, :lo12:__dczva_threshold
+       ldr     tmp2, [tmp1]    /* Load DC ZVA tunable threshold value. */
+       cmp     count, tmp2
+#else
+       cmp     count, DCZVA_THRESHOLD
+#endif

I don't think it makes sense to support both options here. The existing code
is carefully laid out so this undoes the 16-byte alignment of the following loops.

Wilco

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2019-08-13 13:11 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-26 11:58 [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold Feng Xue OS
2019-07-26 15:17 ` Szabolcs Nagy
2019-07-29  2:25   ` Feng Xue OS
2019-07-29  3:46 ` Siddhesh Poyarekar
2019-07-29  3:49   ` Siddhesh Poyarekar
2019-08-02  1:50   ` Feng Xue OS
2019-08-02  3:07     ` Siddhesh Poyarekar
2019-08-02  3:10       ` Siddhesh Poyarekar
2019-08-02  4:31         ` Feng Xue OS
2019-08-06 16:18 Wilco Dijkstra
2019-08-07 14:13 ` Siddhesh Poyarekar
2019-08-08  3:56   ` Feng Xue OS
2019-08-13 13:11     ` Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).