public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
@ 2022-04-14  4:12 Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 2/6] elf: Add tests for the hash functions in dl-hash.h Noah Goldstein
                   ` (17 more replies)
  0 siblings, 18 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving it to
sysdeps/generic/dl-hash.h. The function is renamed to __dl_new_hash
so that it is now in the reserved namespace.
---
 sysdeps/generic/dl-hash.h   | 13 +++++++++++++
 sysdeps/i386/i686/dl-hash.h |  3 +++
 2 files changed, 16 insertions(+)

diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
index 9bc7e3bd67..c041074352 100644
--- a/sysdeps/generic/dl-hash.h
+++ b/sysdeps/generic/dl-hash.h
@@ -19,7 +19,9 @@
 #ifndef _DL_HASH_H
 #define _DL_HASH_H	1
 
+#include <stdint.h>
 
+#ifndef __HAS_DL_ELF_HASH
 /* This is the hashing function specified by the ELF ABI.  In the
    first five operations no overflow is possible so we optimized it a
    bit.  */
@@ -71,5 +73,16 @@ _dl_elf_hash (const char *name_arg)
     }
   return hash;
 }
+#endif /* !__HAS_DL_ELF_HASH */
+
+static uint32_t
+__dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
 
 #endif /* dl-hash.h */
diff --git a/sysdeps/i386/i686/dl-hash.h b/sysdeps/i386/i686/dl-hash.h
index c124480e77..d18370350d 100644
--- a/sysdeps/i386/i686/dl-hash.h
+++ b/sysdeps/i386/i686/dl-hash.h
@@ -75,4 +75,7 @@ _dl_elf_hash (const char *name)
   return result;
 }
 
+#define __HAS_DL_ELF_HASH	1
+#include <sysdeps/generic/dl-hash.h>
+
 #endif /* dl-hash.h */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v1 2/6] elf: Add tests for the hash functions in dl-hash.h
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-04-14  4:12 ` Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                   ` (16 subsequent siblings)
  17 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index c96924e9c2..9c76673a40 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..10b0cef045
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Test dl-hash.h hash functions against simple implementations.  */
+
+#include <dl-hash.h>
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) %s(%lu), %x != %x\n", fill, name, len, expec,
+	      res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &__dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%lu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &__dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v1 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 2/6] elf: Add tests for the hash functions in dl-hash.h Noah Goldstein
@ 2022-04-14  4:12 ` Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                   ` (15 subsequent siblings)
  17 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..9612eec909
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%lu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v1 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 2/6] elf: Add tests for the hash functions in dl-hash.h Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-14  4:12 ` Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                   ` (14 subsequent siblings)
  17 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..aa508a6c4f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -230,6 +230,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -238,7 +244,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -357,9 +363,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..b293ee1d1b
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2013-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..7030a13bae
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure __dl_new_hash runtime
+   Copyright (C) 2013-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) __dl_new_hash (x)
+#define TEST_NAME "__dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..e2e6895afd
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2013-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..5ed0d3e6bf
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2013-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v1 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-04-14  4:12 ` [PATCH v1 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-04-14  4:12 ` Noah Goldstein
  2022-04-14  4:12 ` [PATCH v1 6/6] elf: Optimize __dl_new_hash in dl-hash.h Noah Goldstein
                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations. Unroll the loop by 4 so that the four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v1 6/6] elf: Optimize __dl_new_hash in dl-hash.h
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-04-14  4:12 ` [PATCH v1 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-14  4:12 ` Noah Goldstein
  2022-04-14  4:32 ` [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked H.J. Lu
                   ` (12 subsequent siblings)
  17 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14  4:12 UTC (permalink / raw)
  To: libc-alpha

Unroll slightly so some of the multiplies can be pipelined on
out-of-order machines. Unrolling further started to induce slowdowns
for sizes [0, 4] but can help the loop, so if larger sizes are the
target, further unrolling can be beneficial.

Results for __dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522
---
 sysdeps/generic/dl-hash.h | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
index c041074352..425ab0876e 100644
--- a/sysdeps/generic/dl-hash.h
+++ b/sysdeps/generic/dl-hash.h
@@ -21,6 +21,9 @@
 
 #include <stdint.h>
 
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
 #ifndef __HAS_DL_ELF_HASH
 /* This is the hashing function specified by the ELF ABI.  In the
    first five operations no overflow is possible so we optimized it a
@@ -76,12 +79,29 @@ _dl_elf_hash (const char *name_arg)
 #endif /* !__HAS_DL_ELF_HASH */
 
 static uint32_t
+__attribute__ ((unused))
 __dl_new_hash (const char *s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  unsigned int h = 5381;
+  unsigned char c0, c1;
+  for (;;)
+    {
+      c0 = *s;
+      /* Unlikely length zero string so evens will be slightly less
+         common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = *(s + 1);
+      if (c1 == 0)
+	{
+	  return h * 33 + c0;
+	}
+      h = 33 * 33 * h + 33 * c0 + c1;
+      s += 2;
+    }
 }
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-04-14  4:12 ` [PATCH v1 6/6] elf: Optimize __dl_new_hash in dl-hash.h Noah Goldstein
@ 2022-04-14  4:32 ` H.J. Lu
  2022-04-14 14:56   ` Noah Goldstein
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
                   ` (11 subsequent siblings)
  17 siblings, 1 reply; 167+ messages in thread
From: H.J. Lu @ 2022-04-14  4:32 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Apr 13, 2022 at 9:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No change to the code other than moving it to
> sysdeps/generic/dl-hash.h. Changed name so its now in the
> reserved namespace.
> ---
>  sysdeps/generic/dl-hash.h   | 13 +++++++++++++
>  sysdeps/i386/i686/dl-hash.h |  3 +++
>  2 files changed, 16 insertions(+)
>
> diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
> index 9bc7e3bd67..c041074352 100644
> --- a/sysdeps/generic/dl-hash.h
> +++ b/sysdeps/generic/dl-hash.h
> @@ -19,7 +19,9 @@
>  #ifndef _DL_HASH_H
>  #define _DL_HASH_H     1
>
> +#include <stdint.h>
>
> +#ifndef __HAS_DL_ELF_HASH
>  /* This is the hashing function specified by the ELF ABI.  In the
>     first five operations no overflow is possible so we optimized it a
>     bit.  */
> @@ -71,5 +73,16 @@ _dl_elf_hash (const char *name_arg)
>      }
>    return hash;
>  }
> +#endif /* !__HAS_DL_ELF_HASH */
> +
> +static uint32_t
> +__dl_new_hash (const char *s)

I think this should be put in a new header file, dl-new-hash.h, and rename
the function to _dl_new_hash.

> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
>
>  #endif /* dl-hash.h */
> diff --git a/sysdeps/i386/i686/dl-hash.h b/sysdeps/i386/i686/dl-hash.h
> index c124480e77..d18370350d 100644
> --- a/sysdeps/i386/i686/dl-hash.h
> +++ b/sysdeps/i386/i686/dl-hash.h
> @@ -75,4 +75,7 @@ _dl_elf_hash (const char *name)
>    return result;
>  }
>
> +#define __HAS_DL_ELF_HASH      1
> +#include <sysdeps/generic/dl-hash.h>
> +
>  #endif /* dl-hash.h */
> --
> 2.25.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-04-14  4:32 ` [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked H.J. Lu
@ 2022-04-14 14:55 ` Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (10 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..52eef4e417
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,34 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static uint32_t
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
@ 2022-04-14 14:55   ` Noah Goldstein
  2022-04-25 15:39     ` Florian Weimer
  2022-04-14 14:55   ` [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index c96924e9c2..9c76673a40 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..dd1524abea
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Simple implementation of ELF ABI hash function. */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) %s(%lu), %x != %x\n", fill, name, len, expec,
+	      res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%lu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-04-14 14:55   ` Noah Goldstein
  2022-04-25 15:38     ` Florian Weimer
  2022-04-14 14:55   ` [PATCH v2 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..9612eec909
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%lu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-14 14:55   ` Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..aa508a6c4f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -230,6 +230,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -238,7 +244,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -357,9 +363,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-04-14 14:55   ` [PATCH v2 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-04-14 14:55   ` Noah Goldstein
  2022-04-14 14:55   ` [PATCH v2 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations. Unroll the loop by 4 so that 4x multiplies
can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v2 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-04-14 14:55   ` [PATCH v2 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-14 14:55   ` Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:55 UTC (permalink / raw)
  To: libc-alpha

Unroll slightly so some of the multiplies can be pipelined on
out-of-order machines. Unrolling further started to induce slowdowns
for sizes [0, 4] but can help the loop, so if larger sizes are the
target, further unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522
---
 elf/dl-new-hash.h | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 52eef4e417..b0026706bd 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,14 +20,33 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static uint32_t
+__attribute__ ((unused))
 _dl_new_hash (const char *s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  unsigned int h = 5381;
+  unsigned char c0, c1;
+  for (;;)
+    {
+      c0 = *s;
+      /* Zero-length strings are unlikely, so even lengths will be
+         slightly less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = *(s + 1);
+      if (c1 == 0)
+	{
+	  return h * 33 + c0;
+	}
+      h = 33 * 33 * h + 33 * c0 + c1;
+      s += 2;
+    }
 }
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:32 ` [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked H.J. Lu
@ 2022-04-14 14:56   ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-14 14:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Wed, Apr 13, 2022 at 11:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 13, 2022 at 9:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No change to the code other than moving it to
> > sysdeps/generic/dl-hash.h. Changed name so its now in the
> > reserved namespace.
> > ---
> >  sysdeps/generic/dl-hash.h   | 13 +++++++++++++
> >  sysdeps/i386/i686/dl-hash.h |  3 +++
> >  2 files changed, 16 insertions(+)
> >
> > diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
> > index 9bc7e3bd67..c041074352 100644
> > --- a/sysdeps/generic/dl-hash.h
> > +++ b/sysdeps/generic/dl-hash.h
> > @@ -19,7 +19,9 @@
> >  #ifndef _DL_HASH_H
> >  #define _DL_HASH_H     1
> >
> > +#include <stdint.h>
> >
> > +#ifndef __HAS_DL_ELF_HASH
> >  /* This is the hashing function specified by the ELF ABI.  In the
> >     first five operations no overflow is possible so we optimized it a
> >     bit.  */
> > @@ -71,5 +73,16 @@ _dl_elf_hash (const char *name_arg)
> >      }
> >    return hash;
> >  }
> > +#endif /* !__HAS_DL_ELF_HASH */
> > +
> > +static uint32_t
> > +__dl_new_hash (const char *s)
>
> I think this should be put in a new header file, dl-new-hash.h, and rename
> the function to _dl_new_hash.

Fixed in v2.

>
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> >
> >  #endif /* dl-hash.h */
> > diff --git a/sysdeps/i386/i686/dl-hash.h b/sysdeps/i386/i686/dl-hash.h
> > index c124480e77..d18370350d 100644
> > --- a/sysdeps/i386/i686/dl-hash.h
> > +++ b/sysdeps/i386/i686/dl-hash.h
> > @@ -75,4 +75,7 @@ _dl_elf_hash (const char *name)
> >    return result;
> >  }
> >
> > +#define __HAS_DL_ELF_HASH      1
> > +#include <sysdeps/generic/dl-hash.h>
> > +
> >  #endif /* dl-hash.h */
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-14 14:55   ` [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-25 15:38     ` Florian Weimer
  2022-04-25 15:58       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-25 15:38 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha

* Noah Goldstein via Libc-alpha:

> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  uint32_t expec, res;
> +  char buf[len];
> +  memset (buf, fill, len);
> +
> +  expec = simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    {
> +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
> +      return 1;
> +    }
> +
> +  return 0;
> +}

%lu needs to be %zu, otherwise this fails to build on various
architectures.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-14 14:55   ` [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-04-25 15:39     ` Florian Weimer
  2022-04-25 15:59       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-25 15:39 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha

* Noah Goldstein via Libc-alpha:

> +static int
> +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> +	      hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  char buf[len + 1];
> +  memset (buf, fill, len);
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    {
> +      printf ("FAIL: fill(%d) %s(%lu), %x != %x\n", fill, name, len, expec,
> +	      res);
> +      return 1;
> +    }
> +
> +  return 0;
> +}

Same issue with %lu and %zu here.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
@ 2022-04-25 15:58 ` Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (5 more replies)
  2022-04-25 15:59 ` [PATCH v1 " Adhemerval Zanella
                   ` (9 subsequent siblings)
  17 siblings, 6 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..52eef4e417
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,34 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static uint32_t
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-04-25 15:58   ` Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index 8ed6c3b0b1..493409715e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..7cbc14b46d
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Check _dl_new_hash and _dl_elf_hash against simple versions.  */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+	      res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-04-25 15:58   ` Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..a1f42e3fbc
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-25 15:58   ` Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..aa508a6c4f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -230,6 +230,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -238,7 +244,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -357,9 +363,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-04-25 15:58   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-04-25 15:58   ` Noah Goldstein
  2022-04-25 15:58   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-04-25 16:01   ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Adhemerval Zanella
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that the four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-04-25 15:58   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-25 15:58   ` Noah Goldstein
  2022-04-25 16:01   ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Adhemerval Zanella
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: libc-alpha

Unroll slightly so some of the multiplies can be pipelined on
out-of-order machines. Unrolling further started to induce slowdowns
for sizes [0, 4] but can help the loop, so if larger sizes are the
target further unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522
---
 elf/dl-new-hash.h | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 52eef4e417..b0026706bd 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,14 +20,33 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static uint32_t
+__attribute__ ((unused))
 _dl_new_hash (const char *s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  unsigned int h = 5381;
+  unsigned char c0, c1;
+  for (;;)
+    {
+      c0 = *s;
+      /* Zero-length strings are unlikely, so even lengths will be
+         slightly less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = *(s + 1);
+      if (c1 == 0)
+	{
+	  return h * 33 + c0;
+	}
+      h = 33 * 33 * h + 33 * c0 + c1;
+      s += 2;
+    }
 }
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-25 15:38     ` Florian Weimer
@ 2022-04-25 15:58       ` Noah Goldstein
  2022-04-26  8:35         ` Florian Weimer
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:58 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Mon, Apr 25, 2022 at 10:38 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > +static int
> > +do_fill_tests (size_t len, int fill)
> > +{
> > +  uint32_t expec, res;
> > +  char buf[len];
> > +  memset (buf, fill, len);
> > +
> > +  expec = simple_nss_hash (buf, len);
> > +  res = __nss_hash (buf, len);
> > +  if (expec != res)
> > +    {
> > +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
> > +      return 1;
> > +    }
> > +
> > +  return 0;
> > +}
>
> %lu needs to be %zu, otherwise this fails to build on various
> architectures.

Fixed in v3.
>
> Thanks,
> Florian
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-25 15:39     ` Florian Weimer
@ 2022-04-25 15:59       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 15:59 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Mon, Apr 25, 2022 at 10:39 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > +static int
> > +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> > +           hash_f expecf)
> > +{
> > +  uint32_t expec, res;
> > +  char buf[len + 1];
> > +  memset (buf, fill, len);
> > +  buf[len] = '\0';
> > +
> > +  expec = expecf (buf);
> > +  res = testf (buf);
> > +  if (expec != res)
> > +    {
> > +      printf ("FAIL: fill(%d) %s(%lu), %x != %x\n", fill, name, len, expec,
> > +           res);
> > +      return 1;
> > +    }
> > +
> > +  return 0;
> > +}
>
> Same issue with %lu and %zu here.

Fixed in v3.
>
> Thanks,
> Florian
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-04-25 15:59 ` Adhemerval Zanella
  2022-04-25 16:16   ` Noah Goldstein
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
                   ` (8 subsequent siblings)
  17 siblings, 1 reply; 167+ messages in thread
From: Adhemerval Zanella @ 2022-04-25 15:59 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha



On 14/04/2022 01:12, Noah Goldstein via Libc-alpha wrote:
> No change to the code other than moving it to
> sysdeps/generic/dl-hash.h. Changed name so its now in the
> reserved namespace.
> ---
>  sysdeps/generic/dl-hash.h   | 13 +++++++++++++
>  sysdeps/i386/i686/dl-hash.h |  3 +++
>  2 files changed, 16 insertions(+)
> 
> diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
> index 9bc7e3bd67..c041074352 100644
> --- a/sysdeps/generic/dl-hash.h
> +++ b/sysdeps/generic/dl-hash.h
> @@ -19,7 +19,9 @@
>  #ifndef _DL_HASH_H
>  #define _DL_HASH_H	1
>  
> +#include <stdint.h>
>  
> +#ifndef __HAS_DL_ELF_HASH
>  /* This is the hashing function specified by the ELF ABI.  In the
>     first five operations no overflow is possible so we optimized it a
>     bit.  */
> @@ -71,5 +73,16 @@ _dl_elf_hash (const char *name_arg)
>      }
>    return hash;
>  }
> +#endif /* !__HAS_DL_ELF_HASH */
> +
> +static uint32_t
> +__dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
>  
>  #endif /* dl-hash.h */

Since you are refactoring it, it would be better to remove the
elf/dl-lookup.c version.

> diff --git a/sysdeps/i386/i686/dl-hash.h b/sysdeps/i386/i686/dl-hash.h
> index c124480e77..d18370350d 100644
> --- a/sysdeps/i386/i686/dl-hash.h
> +++ b/ 
> @@ -75,4 +75,7 @@ _dl_elf_hash (const char *name)
>    return result;
>  }
>  
> +#define __HAS_DL_ELF_HASH	1
> +#include <sysdeps/generic/dl-hash.h>
> +
>  #endif /* dl-hash.h */

Do we still need this file? The comments seem quite outdated and I think
it would be better to just remove it.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-04-25 15:58   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-04-25 16:01   ` Adhemerval Zanella
  2022-04-25 16:18     ` Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Adhemerval Zanella @ 2022-04-25 16:01 UTC (permalink / raw)
  To: libc-alpha



On 25/04/2022 12:58, Noah Goldstein via Libc-alpha wrote:
> +#include <stdint.h>
> +
> +static uint32_t
> +_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
> +
> +#endif /* dl-new-hash.h */

If you use a static inline you don't need to use a reserved-namespace name.
And I think it does make sense to use inline here.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-25 15:59 ` [PATCH v1 " Adhemerval Zanella
@ 2022-04-25 16:16   ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:16 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: GNU C Library

On Mon, Apr 25, 2022 at 10:59 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 14/04/2022 01:12, Noah Goldstein via Libc-alpha wrote:
> > No change to the code other than moving it to
> > sysdeps/generic/dl-hash.h. Changed name so its now in the
> > reserved namespace.
> > ---
> >  sysdeps/generic/dl-hash.h   | 13 +++++++++++++
> >  sysdeps/i386/i686/dl-hash.h |  3 +++
> >  2 files changed, 16 insertions(+)
> >
> > diff --git a/sysdeps/generic/dl-hash.h b/sysdeps/generic/dl-hash.h
> > index 9bc7e3bd67..c041074352 100644
> > --- a/sysdeps/generic/dl-hash.h
> > +++ b/sysdeps/generic/dl-hash.h
> > @@ -19,7 +19,9 @@
> >  #ifndef _DL_HASH_H
> >  #define _DL_HASH_H   1
> >
> > +#include <stdint.h>
> >
> > +#ifndef __HAS_DL_ELF_HASH
> >  /* This is the hashing function specified by the ELF ABI.  In the
> >     first five operations no overflow is possible so we optimized it a
> >     bit.  */
> > @@ -71,5 +73,16 @@ _dl_elf_hash (const char *name_arg)
> >      }
> >    return hash;
> >  }
> > +#endif /* !__HAS_DL_ELF_HASH */
> > +
> > +static uint32_t
> > +__dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> >
> >  #endif /* dl-hash.h */
>
> Since you refactoring it, it would be better to remove the elf/dl-lookup.c
> version.

V2 removes the impl in elf/dl-lookup.c
>
> > diff --git a/sysdeps/i386/i686/dl-hash.h b/sysdeps/i386/i686/dl-hash.h
> > index c124480e77..d18370350d 100644
> > --- a/sysdeps/i386/i686/dl-hash.h
> > +++ b/
> > @@ -75,4 +75,7 @@ _dl_elf_hash (const char *name)
> >    return result;
> >  }
> >
> > +#define __HAS_DL_ELF_HASH    1
> > +#include <sysdeps/generic/dl-hash.h>
> > +
> >  #endif /* dl-hash.h */
>
> Do we still need this file? The comments seems quite outdated and I think
> it would be better to just remove it

Not really sure. The commit no longer touches it as of V2.

The patchset ultimately doesn't update `_dl_elf_hash` (there appears to
be room for improvement; I just couldn't get a version that had no regression
for any size) so would prefer to leave it as a separate issue.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-25 16:01   ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Adhemerval Zanella
@ 2022-04-25 16:18     ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:18 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: GNU C Library

On Mon, Apr 25, 2022 at 11:05 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 25/04/2022 12:58, Noah Goldstein via Libc-alpha wrote:
> > +#include <stdint.h>
> > +
> > +static uint32_t
> > +_dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> > +
> > +#endif /* dl-new-hash.h */
>
> If you use a static inline you don't need to use a namespace reversed name.
> And I think it does make sense to use inline here.

Will make inline in V4. Generally in favor of keeping in reserved namespace
just as a practice and so it matches `_dl_elf_hash`.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (8 preceding siblings ...)
  2022-04-25 15:59 ` [PATCH v1 " Adhemerval Zanella
@ 2022-04-25 16:35 ` Noah Goldstein
  2022-04-25 16:35   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (7 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:35 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..52eef4e417
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,34 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static uint32_t
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
@ 2022-04-25 16:35   ` Noah Goldstein
  2022-04-25 16:35   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:35 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index 8ed6c3b0b1..493409715e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..7cbc14b46d
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Simple implementation of ELF ABI hash function. */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+	      res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
  2022-04-25 16:35   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-04-25 16:35   ` Noah Goldstein
  2022-04-27 10:39     ` Florian Weimer
  2022-04-25 16:35   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:35 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..a1f42e3fbc
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash. */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
  2022-04-25 16:35   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-04-25 16:35   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-25 16:35   ` Noah Goldstein
  2022-04-25 16:36   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-04-25 16:36   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:35 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..aa508a6c4f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -230,6 +230,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -238,7 +244,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -357,9 +363,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-04-25 16:35   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-04-25 16:36   ` Noah Goldstein
  2022-04-27 10:47     ` Florian Weimer
  2022-04-25 16:36   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:36 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much, as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that the four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-04-25 16:36   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-25 16:36   ` Noah Goldstein
  2022-04-27 10:43     ` Florian Weimer
  2022-04-27 15:02     ` Alexander Monakov
  4 siblings, 2 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-25 16:36 UTC (permalink / raw)
  To: libc-alpha

Unroll slightly so some of the multiplies can be pipelined on
out-of-order machines. Unrolling further started to induce slowdowns
for sizes [0, 4] but can help the loop, so if larger sizes are the
target, further unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522
---
 elf/dl-new-hash.h | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 52eef4e417..b0026706bd 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,14 +20,33 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static uint32_t
+__attribute__ ((unused))
 _dl_new_hash (const char *s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  unsigned int h = 5381;
+  unsigned char c0, c1;
+  for (;;)
+    {
+      c0 = *s;
+      /* Unlikely length zero string so evens will be slightly less
+         common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = *(s + 1);
+      if (c1 == 0)
+	{
+	  return h * 33 + c0;
+	}
+      h = 33 * 33 * h + 33 * c0 + c1;
+      s += 2;
+    }
 }
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-25 15:58       ` Noah Goldstein
@ 2022-04-26  8:35         ` Florian Weimer
  2022-04-26 21:39           ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-26  8:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha

* Noah Goldstein:

> On Mon, Apr 25, 2022 at 10:38 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Noah Goldstein via Libc-alpha:
>>
>> > +static int
>> > +do_fill_tests (size_t len, int fill)
>> > +{
>> > +  uint32_t expec, res;
>> > +  char buf[len];
>> > +  memset (buf, fill, len);
>> > +
>> > +  expec = simple_nss_hash (buf, len);
>> > +  res = __nss_hash (buf, len);
>> > +  if (expec != res)
>> > +    {
>> > +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
>> > +      return 1;
>> > +    }
>> > +
>> > +  return 0;
>> > +}
>>
>> %lu needs to be %zu, otherwise this fails to build on various
>> architectures.
>
> Fixed in v3.

Thanks; I can confirm the new version builds fine.

Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-26  8:35         ` Florian Weimer
@ 2022-04-26 21:39           ` Noah Goldstein
  2022-04-27 10:48             ` Florian Weimer
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-26 21:39 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Tue, Apr 26, 2022 at 3:36 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein:
>
> > On Mon, Apr 25, 2022 at 10:38 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Noah Goldstein via Libc-alpha:
> >>
> >> > +static int
> >> > +do_fill_tests (size_t len, int fill)
> >> > +{
> >> > +  uint32_t expec, res;
> >> > +  char buf[len];
> >> > +  memset (buf, fill, len);
> >> > +
> >> > +  expec = simple_nss_hash (buf, len);
> >> > +  res = __nss_hash (buf, len);
> >> > +  if (expec != res)
> >> > +    {
> >> > +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
> >> > +      return 1;
> >> > +    }
> >> > +
> >> > +  return 0;
> >> > +}
> >>
> >> %lu needs to be %zu, otherwise this fails to build on various
> >> architectures.
> >
> > Fixed in v3.
>
> Thanks; I can confirm the new version builds fine.
>
> Florian

Thanks, good to push?
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-25 16:35   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-27 10:39     ` Florian Weimer
  2022-04-27 16:24       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-27 10:39 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha

* Noah Goldstein via Libc-alpha:

> diff --git a/nss/Makefile b/nss/Makefile
> index d8b06b44fb..a978e3927a 100644

> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  uint32_t expec, res;
> +  char buf[len];
> +  memset (buf, fill, len);
> +
> +  expec = simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    {
> +      printf ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> +      return 1;

You could use FAIL_EXIT1 here.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-25 16:36   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-04-27 10:43     ` Florian Weimer
  2022-04-27 16:25       ` Noah Goldstein
  2022-04-27 15:02     ` Alexander Monakov
  1 sibling, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-27 10:43 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha

* Noah Goldstein via Libc-alpha:

> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 52eef4e417..b0026706bd 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,14 +20,33 @@
>  #define _DL_NEW_HASH_H 1
>  
>  #include <stdint.h>
>  /* For __glibc_unlikely.  */
>  #include <sys/cdefs.h>
>  
>  static uint32_t
> +__attribute__ ((unused))
>  _dl_new_hash (const char *s)
>  {

Does this change belong here, into this commit?

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-25 16:36   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-27 10:47     ` Florian Weimer
  0 siblings, 0 replies; 167+ messages in thread
From: Florian Weimer @ 2022-04-27 10:47 UTC (permalink / raw)
  To: Noah Goldstein via Libc-alpha

* Noah Goldstein via Libc-alpha:

> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 27a348ea9b..c6a375f386 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -19,58 +19,63 @@
>  
>  /* This is from libc/db/hash/hash_func.c, hash3 is static there */
>  /*
> - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
>   * units.  On the first time through the loop we get the "leftover bytes"
> - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> - * this routine is heavily used enough, it's worth the ugly coding.
> + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> + * HASHC. Further unrolling does not appear to help.

I wonder if this optimization is worth it.  This is used in the nscd
interface only, right?

dl_new_hash performance is at least important to symbol lookup.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-26 21:39           ` Noah Goldstein
@ 2022-04-27 10:48             ` Florian Weimer
  2022-04-27 15:02               ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Florian Weimer @ 2022-04-27 10:48 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha

* Noah Goldstein:

> On Tue, Apr 26, 2022 at 3:36 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Noah Goldstein:
>>
>> > On Mon, Apr 25, 2022 at 10:38 AM Florian Weimer <fweimer@redhat.com> wrote:
>> >>
>> >> * Noah Goldstein via Libc-alpha:
>> >>
>> >> > +static int
>> >> > +do_fill_tests (size_t len, int fill)
>> >> > +{
>> >> > +  uint32_t expec, res;
>> >> > +  char buf[len];
>> >> > +  memset (buf, fill, len);
>> >> > +
>> >> > +  expec = simple_nss_hash (buf, len);
>> >> > +  res = __nss_hash (buf, len);
>> >> > +  if (expec != res)
>> >> > +    {
>> >> > +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
>> >> > +      return 1;
>> >> > +    }
>> >> > +
>> >> > +  return 0;
>> >> > +}
>> >>
>> >> %lu needs to be %zu, otherwise this fails to build on various
>> >> architectures.
>> >
>> > Fixed in v3.
>>
>> Thanks; I can confirm the new version builds fine.
>>
>> Florian
>
> Thanks, good to push?

You said you wanted to post a v4, I think?

Anyway, I commented on a few nits.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-25 16:36   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-04-27 10:43     ` Florian Weimer
@ 2022-04-27 15:02     ` Alexander Monakov
       [not found]       ` <CAFUsyfKeocq4VAusvnggq-NR=tOQTjrD0Z6r3CYCTjGQ=tGGSw@mail.gmail.com>
  1 sibling, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-04-27 15:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Mon, 25 Apr 2022, Noah Goldstein via Libc-alpha wrote:

> Unroll slightly so some of the multiples can be pipelined on out-order
> machines. Unrolling further started to induce slowdowns for sizes
> [0, 4] but can help the loop so if larger sizes are the target
> further unrolling can be beneficial.

Note, the original algorithm does not need a literal multiplication (with
3cyc latency), as h*33 == (h << 5) + h:

> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;

in musl we spell this out as 'h += h*32 + c' to avoid GCC emitting a
multiplication at -Os.

In 'h = (h + c) + (h << 5)' critical path has latency of only 2 cycles,
and (h + c) goes independent of (h << 5).

(if the original loop is implemented with a multiplication, its critical
path has 4-cycle latency, 3cyc from the multiplication plus 1 from the
addition)

However, when you reroll the loop and overlap two iterations, multiplication
by 33*33 no longer has this nice property and runs with two 4cyc paths
overlapped (so effective critical path is the same as original).

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-27 10:48             ` Florian Weimer
@ 2022-04-27 15:02               ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 15:02 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Wed, Apr 27, 2022 at 5:48 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein:
>
> > On Tue, Apr 26, 2022 at 3:36 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Noah Goldstein:
> >>
> >> > On Mon, Apr 25, 2022 at 10:38 AM Florian Weimer <fweimer@redhat.com> wrote:
> >> >>
> >> >> * Noah Goldstein via Libc-alpha:
> >> >>
> >> >> > +static int
> >> >> > +do_fill_tests (size_t len, int fill)
> >> >> > +{
> >> >> > +  uint32_t expec, res;
> >> >> > +  char buf[len];
> >> >> > +  memset (buf, fill, len);
> >> >> > +
> >> >> > +  expec = simple_nss_hash (buf, len);
> >> >> > +  res = __nss_hash (buf, len);
> >> >> > +  if (expec != res)
> >> >> > +    {
> >> >> > +      printf ("FAIL: fill(%d) (%lu), %x != %x\n", fill, len, expec, res);
> >> >> > +      return 1;
> >> >> > +    }
> >> >> > +
> >> >> > +  return 0;
> >> >> > +}
> >> >>
> >> >> %lu needs to be %zu, otherwise this fails to build on various
> >> >> architectures.
> >> >
> >> > Fixed in v3.
> >>
> >> Thanks; I can confirm the new version builds fine.
> >>
> >> Florian
> >
> > Thanks, good to push?
>
> You said you wanted to post a v4, I think?

Resent v3 like a doofus; will fix your nits as well as Adhemerval's in v4.
>
> Anyway, I commented on a few nits.
>
> Thanks,
> Florian
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (9 preceding siblings ...)
  2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
@ 2022-04-27 16:19 ` Noah Goldstein
  2022-04-27 16:19   ` [PATCH v4 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (6 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:19 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..40d88c81f9
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,35 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-04-27 16:19   ` Noah Goldstein
  2022-04-27 16:19   ` [PATCH v4 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:19 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index 8ed6c3b0b1..493409715e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..21d72788dd
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Test _dl_new_hash and _dl_elf_hash against simple implementations.  */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-27 16:19   ` [PATCH v4 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-04-27 16:19   ` Noah Goldstein
  2022-04-27 16:20   ` [PATCH v4 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:19 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..c6c119730d
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,104 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-04-27 16:19   ` [PATCH v4 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-04-27 16:19   ` [PATCH v4 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-04-27 16:20   ` Noah Goldstein
  2022-04-27 16:20   ` [PATCH v4 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-04-27 16:20   ` [PATCH v4 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:20 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..aa508a6c4f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -230,6 +230,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -238,7 +244,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -357,9 +363,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-04-27 16:20   ` [PATCH v4 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-04-27 16:20   ` Noah Goldstein
  2022-04-27 16:20   ` [PATCH v4 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:20 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much, as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that 4
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version of
+ * HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v4 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-04-27 16:20   ` [PATCH v4 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-04-27 16:20   ` Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:20 UTC (permalink / raw)
  To: libc-alpha

Unroll slightly so some of the multiplies can be pipelined on
out-of-order machines. Unrolling further started to induce slowdowns
for sizes [0, 4] but can help the loop, so if larger sizes are the
target, further unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522
---
 elf/dl-new-hash.h | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..79a2d79465 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,33 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static inline uint32_t
 __attribute__ ((unused))
 _dl_new_hash (const char *s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  unsigned int h = 5381;
+  unsigned char c0, c1;
+  for (;;)
+    {
+      c0 = *s;
+      /* A zero-length string is unlikely, so even lengths (which exit
+         here) will be slightly less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = *(s + 1);
+      if (c1 == 0)
+	{
+	  return h * 33 + c0;
+	}
+      h = 33 * 33 * h + 33 * c0 + c1;
+      s += 2;
+    }
 }
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
       [not found]         ` <f54f1ec9-fc31-283f-bce9-59fd8bda98ad@ispras.ru>
@ 2022-04-27 16:23           ` Noah Goldstein
  2022-04-28 18:03             ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:23 UTC (permalink / raw)
  To: Alexander Monakov, GNU C Library

On Wed, Apr 27, 2022 at 11:17 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 Apr 2022, Noah Goldstein wrote:
>
> > > However, when you reroll the loop and overlap two iterations, multiplication
> > > by 33*33 no longer has this nice property and runs with two 4cyc paths
> > > overlapped (so effective critical path is the same as original).
> >
> > the 33 * c0 can still use `addl; sall; addl` so not sure what you mean by
> > two 4cyc paths overlapped. Its one 4c path.
> >
> > `imul; addl` and `addl; sall; addl`.
> >
> > But it's fair that either wait its 4c of computation for 2 iterations. The
> > difference is the 5c load latency being amortized over 2 iterations
> > or 1 iteration.
>
> Right, it's one 4c path, I was thinking about something else for a moment.
> I'm not sure it's correct to amortize load latency like that, I'd say the
> difference is just that the original loop cannot issue two loads at once
> because of the dependency in its address computation.
>
> I see you dropped libc-alpha@ from Cc:, was that intentional?
No, that was a misclick, sorry. Adding it back.

I think it is the way you're doing your analysis as a loop-carried
dependency. I.e. really 7c per iteration with no unroll (although
it's fair that the loads on address can speculate ahead so it will
indeed be faster) vs 9c per 2x iterations.

>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-04-27 10:39     ` Florian Weimer
@ 2022-04-27 16:24       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:24 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Wed, Apr 27, 2022 at 5:39 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > diff --git a/nss/Makefile b/nss/Makefile
> > index d8b06b44fb..a978e3927a 100644
>
> > +static int
> > +do_fill_tests (size_t len, int fill)
> > +{
> > +  uint32_t expec, res;
> > +  char buf[len];
> > +  memset (buf, fill, len);
> > +
> > +  expec = simple_nss_hash (buf, len);
> > +  res = __nss_hash (buf, len);
> > +  if (expec != res)
> > +    {
> > +      printf ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> > +      return 1;
>
> You could use FAIL_EXIT1 here.

Fixed in V4.
>
> Thanks,
> Florian
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-27 10:43     ` Florian Weimer
@ 2022-04-27 16:25       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-04-27 16:25 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha

On Wed, Apr 27, 2022 at 5:43 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > index 52eef4e417..b0026706bd 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,14 +20,33 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> >  /* For __glibc_unlikely.  */
> >  #include <sys/cdefs.h>
> >
> >  static uint32_t
> > +__attribute__ ((unused))
> >  _dl_new_hash (const char *s)
> >  {
>
> Does this change belong here, into this commit?

Moved it to the refactor commit in V4.
>
> Thanks,
> Florian
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-27 16:23           ` Noah Goldstein
@ 2022-04-28 18:03             ` Alexander Monakov
  2022-05-04 18:04               ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-04-28 18:03 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Wed, 27 Apr 2022, Noah Goldstein via Libc-alpha wrote:

> I think it is the way you're doing your analysis as a loop-carried
> dependency. I.e really 7c per iteration with no unroll (although
> its fair the loads on address can speculate ahead so it will
> indeed be faster) vs 9c per 2x iterations.

Hm? Right, the CPU will issue loads speculatively, so you shouldn't count
load latency as part of critical path.

I don't understand how you get a 2x improvement on long strings, did you
run the benchmark with rdtscp timing, i.e. with

    make USE_RDTSCP=1 bench

?

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-04-28 18:03             ` Alexander Monakov
@ 2022-05-04 18:04               ` Alexander Monakov
  2022-05-05 11:07                 ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-04 18:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

Hi,

I managed to run the new benchtest; I understand how you get a 2x speedup
on long inputs now.

Tigerlake belongs to Intel CPU generations that don't perform move
elimination for general registers (along with Icelake and Sandybridge).
So register copy in preparation to shift 'h' by 5 costs an extra cycle
on the critical path.

Furthermore, h*33 + *c gets evaluated as '((h + h*32) + *c) instead of
'((h + *c) + h*32)', which prevents interleaving the additions and thus
puts one extra add on the critical path.

(gcc-11 gets this right if you assist it by writing 'h = h + *c + h*32')

So due to the above issues, on Tigerlake you get 4 cycles for the original
loop, and also 4 cycles for the modified two-at-a-time loop.

(on top of that, gcc-10 fails to eliminate extra zero-extends and ends up
with two zero-extends for *c in the loop)

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-04 18:04               ` Alexander Monakov
@ 2022-05-05 11:07                 ` Alexander Monakov
  2022-05-05 15:10                   ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-05 11:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Wed, 4 May 2022, Alexander Monakov wrote:
> Tigerlake belongs to Intel CPU generations that don't perform move
> elimination for general registers (along with Icelake and Sandybridge).
> So register copy in preparation to shift 'h' by 5 costs an extra cycle
> on the critical path.
> 
> Furthermore, h*33 + *c gets evaluated as '((h + h*32) + *c) instead of
> '((h + *c) + h*32)', which prevents interleaving the additions and thus
> puts one extra add on the critical path.
> 
> (gcc-11 gets this right if you assist it by writing 'h = h + *c + h*32')
> 
> So due to the above issues, on Tigerlake you get 4 cycles for the original
> loop, and also 4 cycles for the modified two-at-a-time loop.

The following variant of the original loop avoids those issues and
should run close to 2 cycles per iteration on most CPUs:

/* Compute the hash of the NUL-terminated string S: start from 5381 and
   fold in each byte as h = h * 33 + byte.  The update is deliberately
   split into 'c += h' and 'h = h * 32 + c' so that the add and the
   shift of H do not depend on each other.  */
static uint32_t
_dl_new_hash (const char *s)
{
  uint32_t h = 5381, c;
  /* Read the string through an unsigned char pointer so that each byte
     is zero-extended when it is stored into C.  */
  const unsigned char *us = (const void *)s;
  while ((c = *us++))
    {
      /* c = h + byte.  */
      c += h;
      /* Empty asm that claims to read C and to read and write H.  It
         emits no instructions; it only keeps the compiler from merging
         the statements around it back into a single 'h * 33 + byte'
         chain.  */
      asm("" : "+r"(h) : "r"(c));
      /* h * 32 + (h + byte) == h * 33 + byte.  */
      h = h * 32 + c;
    }
  return h;
}

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-05 11:07                 ` Alexander Monakov
@ 2022-05-05 15:10                   ` Noah Goldstein
  2022-05-05 15:26                     ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-05 15:10 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: GNU C Library

On Thu, May 5, 2022 at 6:07 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 4 May 2022, Alexander Monakov wrote:
> > Tigerlake belongs to Intel CPU generations that don't perform move
> > elimination for general registers (along with Icelake and Sandybridge).
> > So register copy in preparation to shift 'h' by 5 costs an extra cycle
> > on the critical path.
> >
> > Furthermore, h*33 + *c gets evaluated as '((h + h*32) + *c) instead of
> > '((h + *c) + h*32)', which prevents interleaving the additions and thus
> > puts one extra add on the critical path.
> >
> > (gcc-11 gets this right if you assist it by writing 'h = h + *c + h*32')
> >
> > So due to the above issues, on Tigerlake you get 4 cycles for the original
> > loop, and also 4 cycles for the modified two-at-a-time loop.
>
> The following variant of the original loop avoids those issues and
> should run close to 2 cycles per iteration on most CPUs:
>
> static uint32_t
> _dl_new_hash (const char *s)
> {
>   uint32_t h = 5381, c;
>   const unsigned char *us = (const void *)s;
>   while ((c = *us++))
>     {
>       c += h;
>       asm("" : "+r"(h) : "r"(c));
>       h = h * 32 + c;
>     }
>   return h;
> }
>

I'm not sure what you are getting at with the asm(). It seems to
produce the exact same assembly:
https://godbolt.org/z/93qGMaTTE
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-05 15:10                   ` Noah Goldstein
@ 2022-05-05 15:26                     ` Alexander Monakov
  2022-05-05 18:03                       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-05 15:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Thu, 5 May 2022, Noah Goldstein wrote:

> > The following variant of the original loop avoids those issues and
> > should run close to 2 cycles per iteration on most CPUs:
> >
> > static uint32_t
> > _dl_new_hash (const char *s)
> > {
> >   uint32_t h = 5381, c;
> >   const unsigned char *us = (const void *)s;
> >   while ((c = *us++))
> >     {
> >       c += h;
> >       asm("" : "+r"(h) : "r"(c));
> >       h = h * 32 + c;
> >     }
> >   return h;
> > }
> >
> 
> I'm not sure what you are getting at with the asm(). It seems the
> produce the exact
> same assembly:
> https://godbolt.org/z/93qGMaTTE

They are definitely not the same even via your link. Loop body of dl_new_hash0:

.L3:
(a)	addl    %eax, %edx
	addq    $1, %rcx
(b)	sall    $5, %eax
(c)	addl    %edx, %eax
	movzbl  -1(%rcx), %edx
	testl   %edx, %edx
	jne     .L3

and of dl_new_hash1:

.L9:
(A)	movl    %r8d, %eax
	addq    $1, %rcx
(B)	sall    $5, %eax
(C)	addl    %edx, %eax
	movzbl  -1(%rcx), %edx
(D)	addl    %eax, %r8d
	testl   %edx, %edx
	jne     .L9

(in fact even the instruction count is not the same, 7 vs 8)

In the first loop, (a) and (b) are independent, (c) depends on them both, and
on the next iteration (a) and (b) take the result of (c) from the previous. Thus
the dependencies are 2 cycles for one iteration.

In the second loop, (B) depends on (A), (C) depends on (B), (D) depends on (C),
and on the next iteration (A) depends on (D) from the previous. Thus four
instructions form a dependency chain of 3 or 4 cycles depending if move
elimination happens or not.

In any case, if you benchmark them both you should see the difference.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-05 15:26                     ` Alexander Monakov
@ 2022-05-05 18:03                       ` Noah Goldstein
  2022-05-05 19:37                         ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-05 18:03 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: GNU C Library

On Thu, May 5, 2022 at 10:26 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Thu, 5 May 2022, Noah Goldstein wrote:
>
> > > The following variant of the original loop avoids those issues and
> > > should run close to 2 cycles per iteration on most CPUs:
> > >
> > > static uint32_t
> > > _dl_new_hash (const char *s)
> > > {
> > >   uint32_t h = 5381, c;
> > >   const unsigned char *us = (const void *)s;
> > >   while ((c = *us++))
> > >     {
> > >       c += h;
> > >       asm("" : "+r"(h) : "r"(c));
> > >       h = h * 32 + c;
> > >     }
> > >   return h;
> > > }
> > >
> >
> > I'm not sure what you are getting at with the asm(). It seems the
> > produce the exact
> > same assembly:
> > https://godbolt.org/z/93qGMaTTE
>
> They are definitely not the same even via your link. Loop body of dl_new_hash0:

Oh wow. Must have 'looked at it' before it reloaded.
I'm sorry!
>
> .L3:
> (a)     addl    %eax, %edx
>         addq    $1, %rcx
> (b)     sall    $5, %eax
> (c)     addl    %edx, %eax
>         movzbl  -1(%rcx), %edx
>         testl   %edx, %edx
>         jne     .L3
>
> and of dl_new_hash1:
>
> .L9:
> (A)     movl    %r8d, %eax
>         addq    $1, %rcx
> (B)     sall    $5, %eax
> (C)     addl    %edx, %eax
>         movzbl  -1(%rcx), %edx
> (D)     addl    %eax, %r8d
>         testl   %edx, %edx
>         jne     .L9
>
> (in fact even the instruction count is not the same, 7 vs 8)
>
> In the first loop, (a) and (b) are independent, (c) depends on them both, and
> on the next iteration (a) and (b) take the result of (c) from the previous. Thus
> the dependencies are 2 cycles for one iteration.
>
> In the second loop, (B) depends on (A), (C) depends on (B), (D) depends on (C),
> and on the next iteration (A) depends on (D) from the previous. Thus four
> instructions form a dependency chain of 3 or 4 cycles depending if move
> elimination happens or not.
>
> In any case, if you benchmark them both you should see the difference.
>
Okay that makes sense and indeed results in a substantial improvement.

Totally happy with going with your version.

Think there is still some benefit to the unrolled version because

1) It's less eager about hitting the LSD on newer processors
(but that's really only an issue for strings > ~24 characters).

2) It bottlenecks less hard on `p6` because the `imul` goes `p0`
and the branches are distributed between `p0` and `p6` instead of
always on `p6`.

3) It still saves a few uops (although imul vs `add + shl` isn't really
a meaningful save).

Either way it will be an improvement.

Little benchmark: https://godbolt.org/z/G6PvW4eTr

Generally see hash2 winning the most.

> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-05 18:03                       ` Noah Goldstein
@ 2022-05-05 19:37                         ` Alexander Monakov
  2022-05-05 22:51                           ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-05 19:37 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Thu, 5 May 2022, Noah Goldstein wrote:
> Okay that makes sense and indeed results in a substantial improvement.
> 
> Totally happy with going with your version.
> 
> Think there is still some benefit to the unrolled version because
> 
> 1) It's less eager about hitting the LSD on newer processors
> (but that's really only an issue for strings > ~24 characters).
> 
> 2) It bottlenecks less hard on `p6` because the `imul` goes `p0`
> and the branches are distributed between `p0` and `p6` instead of
> always on `p6`.
> 
> 3) It still saves a few uops (although imul vs `add + shl` isn't really
> a meaningful save).

Agreed; let me point out though that your variant is preferable provided that
multiplication is at most 3 cycles (or integer multiply-add is at most 4).
That's a given on not-too-old x86, but I'm not sure how things stand elsewhere.

On the other hand, if a 3-cycle multiply-add is available, your variant is
strongly preferable (this is generic code so we should try to think a bit
about other architectures).

I'd recommend to reword commit message of your patch 6/6 so it properly
explains that the apparent 2x speedup is due to baseline case hitting
two issues that slow it down by, well, almost 2x.

> Either way it will be an improvement.
> 
> Little benchmark: https://godbolt.org/z/G6PvW4eTr
> 
> Generally see hash2 winning the most.

I noticed that hash2 needs more magic empty asms, because the compiler may
reassociate 'h = h + (32 * c0 + c1);' to 'h = (h + c1) + 32 * c0' which
increases the dependency chain. I adjusted it like this:

        c1 += c0;
        asm("" : "+r"(c1), "+r"(c0));
        h *= 33 * 33;
        c1 += c0 * 32;
        asm("" : "+r"(c1));
        h += c1;
        s += 2;

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-05 19:37                         ` Alexander Monakov
@ 2022-05-05 22:51                           ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-05 22:51 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: GNU C Library

On Thu, May 5, 2022 at 2:37 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Thu, 5 May 2022, Noah Goldstein wrote:
> > Okay that makes sense and indeed results in a substantial improvement.
> >
> > Totally happy with going with your version.
> >
> > Think there is still some benefit to the unrolled version because
> >
> > 1) It's less eager about hitting the LSD on newer processors
> > (but that's really only an issue for strings > ~24 characters).
> >
> > 2) It bottlenecks less hard on `p6` because the `imul` goes `p0`
> > and the branches are distributed between `p0` and `p6` instead of
> > always on `p6`.
> >
> > 3) It still saves a few uops (although imul vs `add + shl` isn't really
> > a meaningful save).
>
> Agreed; let me point out though that your variant is preferable provided that
> multiplication is at most 3 cycles (or integer multiply-add is at most 4).
> That's a given on not-too-old x86, but I'm not sure how things stand elsewhere.
>
> On the other hand, if a 3-cycle multiply-add is available, you variant is
> strongly preferable (this is generic code so we should try to think a bit
> about other architectures).

Does anyone have any thoughts on this? I generally think 32-bit multiply
is safe (as opposed to 64-bit). But if anyone has any feedback these
are the three versions in contention:

/* Pros: No multiply.  Good scheduling.
   Cons: Slightly slower than the alternatives (tested on modern x86).
   Ugly asm statement.

   Returns the hash of the NUL-terminated string S: 5381 folded with
   each byte as h = h * 33 + byte (modulo 2^32).  */
static uint32_t
dl_new_hash0(const char * s) {
    uint32_t              h  = 5381, c;
    /* Read bytes as unsigned so they zero-extend into C.  */
    const unsigned char * us = (const void *)s;
    while ((c = *us++)) {
        /* c = h + byte; independent of the shift of H below.  */
        c += h;
        /* Empty asm that claims to read C and to read and write H.  It
           emits no instructions; it only stops the compiler from
           merging the surrounding statements back into a single
           'h * 33 + byte' chain.  Spelled __asm__ so it is also
           accepted in strict ISO C modes.  */
        __asm__("" : "+r"(h) : "r"(c));
        /* h * 32 + (h + byte) == h * 33 + byte.  */
        h = h * 32 + c;
    }
    return h;
}

/* Pros: Fastest version (tested on modern x86).  Good scheduling.
   Cons: 32-bit multiply.  Ugly asm statements.

   Returns the hash of the NUL-terminated string S: 5381 folded with
   each byte as h = h * 33 + byte (modulo 2^32).  Two bytes are
   consumed per iteration.  */
static uint32_t
dl_new_hash2(const unsigned char * s) {
    uint32_t h = 5381;
    uint32_t c0, c1;
    for (;;) {
        c0 = (uint32_t)(*s);
        /* Unlikely length zero string so evens will be slightly less
           common.  */
        if (__glibc_unlikely(c0 == 0)) {
            return h;
        }

        c1 = (uint32_t)(*(s + 1));
        if (c1 == 0) {
            /* Odd length: fold in the last byte as h * 32 + (h + c0),
               which equals h * 33 + c0.  The empty asm keeps the
               compiler from merging the two statements around it.  */
            c0 += h;
            __asm__("" : "+r"(h) : "r"(c0));
            h = h * 32 + c0;
            return h;
        }
        /* Build 33 * c0 + c1 as (c1 + c0) + 32 * c0, separately from
           the multiply of H.  The empty asms emit no instructions; they
           only stop the compiler from reassociating the additions into
           the chain that updates H.  */
        c1 += c0;
        __asm__("" : "+r"(c1), "+r"(c0));
        h *= 33 * 33;
        c1 += c0 * 32;
        __asm__("" : "+r"(c1));
        /* h = 33 * 33 * h + 33 * c0 + c1, i.e. two steps at once.  */
        h += c1;
        s += 2;
    }
}

/* Pros: Faster than hash0, slower than hash2.  No asm statement.
   Cons: 32-bit multiply; the compiler can slightly de-optimize the
   scheduling.

   Returns the hash of the NUL-terminated string S: 5381 folded with
   each byte as h = h * 33 + byte (modulo 2^32).  Two bytes are
   consumed per iteration.  */
static uint32_t
dl_new_hash3(const char * s) {
    uint32_t      h = 5381;
    /* Stored as unsigned char so bytes >= 0x80 are not sign-extended.  */
    unsigned char c0, c1;
    for (;;) {
        c0 = *s;
        /* Unlikely length zero string so evens will be slightly less
           common.  */
        if (__glibc_unlikely(c0 == 0)) {
            return h;
        }

        c1 = *(s + 1);
        if (c1 == 0) {
            /* Odd length: one ordinary step for the last byte.  */
            return h * 33 + c0;
        }
        /* Two steps at once: (h * 33 + c0) * 33 + c1.  */
        h = 33 * 33 * h + 33 * c0 + c1;
        s += 2;
    }
}



>
> I'd recommend to reword commit message of your patch 6/6 so it properly
> explains that the apparent 2x speedup is due to baseline case hitting
> two issues that slow it down by, well, almost 2x.

Makes sense. I'll change in next version (going to leave a bit of time for
feedback on the 32-bit multiply).
>
> > Either way it will be an improvement.
> >
> > Little benchmark: https://godbolt.org/z/G6PvW4eTr
> >
> > Generally see hash2 winning the most.
>
> I noticed that hash2 needs more magic empty asms, because the compiler may
> reassociate 'h = h + (32 * c0 + c1);' to 'h = (h + c1) + 32 * c0' which
> increases the dependency chain. I adjusted it like this:
>
>         c1 += c0;
>         asm("" : "+r"(c1), "+r"(c0));
>         h *= 33 * 33;
>         c1 += c0 * 32;
>         asm("" : "+r"(c1));
>         h += c1;
>         s += 2;

Bright. Thanks!
>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (10 preceding siblings ...)
  2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-09 17:17 ` Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (5 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..40d88c81f9
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,35 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-09 17:17   ` Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..0e72f913a0 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..e806a274ca
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Simple implementation of ELF ABI hash function. */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-09 17:17   ` Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..6bb2ce06ab
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-09 17:17   ` Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-09 17:17   ` [PATCH v5 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-09 17:17   ` Noah Goldstein
  2022-05-09 17:17   ` [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much, as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version of
+ * HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-09 17:17   ` [PATCH v5 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-09 17:17   ` Noah Goldstein
  2022-05-10 11:58     ` Adhemerval Zanella
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-09 17:17 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note the unrolling allows
for pipelined multiplies which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..70891d374c 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,55 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *signed_s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = signed_s;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = (unsigned int) *s;
+      /* Unlikely length zero string so evens will be slightly less
+	 common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = (unsigned int) *(s + 1);
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	     c0 += h;
+	     h *= 32;
+	     h += c0;
+	     The asm statement ensures the compiler can't mess that up.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+      return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+	 The asm statements ensure the compiler can't mess that up.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
 
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-09 17:17   ` [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-10 11:58     ` Adhemerval Zanella
  0 siblings, 0 replies; 167+ messages in thread
From: Adhemerval Zanella @ 2022-05-10 11:58 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha; +Cc: Alexander Monakov



On 09/05/2022 14:17, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=25 runs
> Geometric of all benchmark New / Old: 0.791
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    0.641,    0.658,               0.974
>  fixed,      1,    1.888,    1.883,               1.003
>  fixed,      2,    2.712,    2.833,               0.957
>  fixed,      3,    3.314,    3.739,               0.886
>  fixed,      4,    4.316,    4.866,               0.887
>  fixed,      5,     5.16,    5.966,               0.865
>  fixed,      6,    5.986,    7.241,               0.827
>  fixed,      7,    7.264,    8.435,               0.861
>  fixed,      8,    8.052,    9.846,               0.818
>  fixed,      9,    9.369,   11.316,               0.828
>  fixed,     10,   10.256,   12.925,               0.794
>  fixed,     11,   12.191,   14.546,               0.838
>  fixed,     12,   12.667,    15.92,               0.796
>  fixed,     13,   14.442,   17.465,               0.827
>  fixed,     14,   14.808,   18.981,                0.78
>  fixed,     15,   16.244,   20.565,                0.79
>  fixed,     16,   17.166,   22.044,               0.779
>  fixed,     32,   35.447,   50.558,               0.701
>  fixed,     64,   86.479,  134.529,               0.643
>  fixed,    128,  155.453,  287.527,               0.541
>  fixed,    256,   302.57,   593.64,                0.51
> random,      2,   11.168,    10.61,               1.053
> random,      4,   13.308,    13.53,               0.984
> random,      8,   16.579,   19.437,               0.853
> random,     16,   21.292,   24.776,               0.859
> random,     32,    30.56,   35.906,               0.851
> random,     64,   49.249,   68.577,               0.718
> random,    128,   81.845,  140.664,               0.582
> random,    256,  152.517,  292.204,               0.522
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>

Buildbot failed to build it [1]:

make[2]: Entering directory '/glibc/elf'
gcc -m32 dl-lookup.c -c -std=gnu11 -fgnu89-inline  -g -O2 -Wall -Wwrite-strings -Wundef -Werror -fmerge-all-constants -frounding-math -fno-stack-protector -fno-common -Wstrict-prototypes -Wold-style-definition -fmath-errno    -fPIC  -fno-stack-protector -DSTACK_PROTECTOR_LEVEL=0 -Wa,-mtune=i686  -mno-sse -mno-mmx -mfpmath=387    -fexceptions -fasynchronous-unwind-tables  -ftls-model=initial-exec      -I../include -I/build/elf  -I/build  -I../sysdeps/unix/sysv/linux/i386/i686  -I../sysdeps/i386/i686/nptl  -I../sysdeps/unix/sysv/linux/i386  -I../sysdeps/unix/sysv/linux/x86/include -I../sysdeps/unix/sysv/linux/x86  -I../sysdeps/x86/nptl  -I../sysdeps/i386/nptl  -I../sysdeps/unix/sysv/linux/include -I../sysdeps/unix/sysv/linux  -I../sysdeps/nptl  -I../sysdeps/pthread  -I../sysdeps/gnu  -I../sysdeps/unix/inet  -I../sysdeps/unix/sysv  -I../sysdeps/unix/i386  -I../sysdeps/unix  -I../sysdeps/posix  -I../sysdeps/i386/i686/fpu/multiarch  -I../sysdeps/i386/i686/fpu  -I../sysdeps/i386/i686/multiarch  -I../sysdeps/i386/i686  -I../sysdeps/i386/fpu  -I../sysdeps/x86/fpu  -I../sysdeps/i386  -I../sysdeps/x86/include -I../sysdeps/x86  -I../sysdeps/wordsize-32  -I../sysdeps/ieee754/float128  -I../sysdeps/ieee754/ldbl-96/include -I../sysdeps/ieee754/ldbl-96  -I../sysdeps/ieee754/dbl-64  -I../sysdeps/ieee754/flt-32  -I../sysdeps/ieee754  -I../sysdeps/generic  -I.. -I../libio -I.  -D_LIBC_REENTRANT -include /build/libc-modules.h -DMODULE_NAME=rtld -include ../include/libc-symbols.h  -DPIC -DSHARED     -DTOP_NAMESPACE=glibc -o /build/elf/dl-lookup.os -MD -MP -MF /build/elf/dl-lookup.os.dt -MT /build/elf/dl-lookup.os
In file included from dl-lookup.c:27:
./dl-new-hash.h: In function '_dl_new_hash':
./dl-new-hash.h:30:28: error: pointer targets in initialization of 'const unsigned char *' from 'const char *' differ in signedness [-Werror=pointer-sign]
   30 |   const unsigned char *s = signed_s;
      |                            ^~~~~~~~
cc1: all warnings being treated as errors
make[2]: *** [../o-iterator.mk:9: /build/elf/dl-lookup.os] Error 1
gcc -m32 dl-lookup.c -c -std=gnu11 -fgnu89-inline  -g -O2 -Wall -Wwrite-strings -Wundef -Werror -fmerge-all-constants -frounding-math -fno-stack-protector -fno-common -Wstrict-prototypes -Wold-style-definition -fmath-errno    -fpie  -fno-stack-protector -DSTACK_PROTECTOR_LEVEL=0 -Wa,-mtune=i686 -fexceptions -fasynchronous-unwind-tables  -ftls-model=initial-exec      -I../include -I/build/elf  -I/build  -I../sysdeps/unix/sysv/linux/i386/i686  -I../sysdeps/i386/i686/nptl  -I../sysdeps/unix/sysv/linux/i386  -I../sysdeps/unix/sysv/linux/x86/include -I../sysdeps/unix/sysv/linux/x86  -I../sysdeps/x86/nptl  -I../sysdeps/i386/nptl  -I../sysdeps/unix/sysv/linux/include -I../sysdeps/unix/sysv/linux  -I../sysdeps/nptl  -I../sysdeps/pthread  -I../sysdeps/gnu  -I../sysdeps/unix/inet  -I../sysdeps/unix/sysv  -I../sysdeps/unix/i386  -I../sysdeps/unix  -I../sysdeps/posix  -I../sysdeps/i386/i686/fpu/multiarch  -I../sysdeps/i386/i686/fpu  -I../sysdeps/i386/i686/multiarch  -I../sysdeps/i386/i686  -I../sysdeps/i386/fpu  -I../sysdeps/x86/fpu  -I../sysdeps/i386  -I../sysdeps/x86/include -I../sysdeps/x86  -I../sysdeps/wordsize-32  -I../sysdeps/ieee754/float128  -I../sysdeps/ieee754/ldbl-96/include -I../sysdeps/ieee754/ldbl-96  -I../sysdeps/ieee754/dbl-64  -I../sysdeps/ieee754/flt-32  -I../sysdeps/ieee754  -I../sysdeps/generic  -I.. -I../libio -I.  -D_LIBC_REENTRANT -include /build/libc-modules.h -DMODULE_NAME=libc -include ../include/libc-symbols.h  -DPIC     -DTOP_NAMESPACE=glibc -o /build/elf/dl-lookup.o -MD -MP -MF /build/elf/dl-lookup.o.dt -MT /build/elf/dl-lookup.o
In file included from dl-lookup.c:27:
./dl-new-hash.h: In function '_dl_new_hash':
./dl-new-hash.h:30:28: error: pointer targets in initialization of 'const unsigned char *' from 'const char *' differ in signedness [-Werror=pointer-sign]
   30 |   const unsigned char *s = signed_s;
      |                            ^~~~~~~~

[1] https://www.delorie.com/trybots/32bit/9123/make.tail.txt

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (11 preceding siblings ...)
  2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-10 15:04 ` Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (4 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it is now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..40d88c81f9
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,35 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-10 15:04   ` Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..0e72f913a0 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..e806a274ca
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Test _dl_new_hash and _dl_elf_hash against simple reference versions.  */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-10 15:04   ` Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..6bb2ce06ab
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-10 15:04   ` Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-10 15:04   ` [PATCH v6 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-10 15:04   ` Noah Goldstein
  2022-05-10 15:04   ` [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that the four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-10 15:04   ` [PATCH v6 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-10 15:04   ` Noah Goldstein
  2022-05-10 15:29     ` H.J. Lu
  2022-05-10 16:49     ` Alexander Monakov
  4 siblings, 2 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 15:04 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note the unrolling allows
for pipelined multiplies which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..cacbeec289 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,55 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *signed_s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) signed_s;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = (unsigned int) *s;
+      /* Unlikely length zero string so evens will be slightly less
+	 common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = (unsigned int) *(s + 1);
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	     c0 += h;
+	     h *= 32;
+	     h += c0;
+	     The asm statement ensures the compiler can't mess that up.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+      return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+	 The asm statements ensures the compiler can't mess that up.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
 
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 15:04   ` [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-10 15:29     ` H.J. Lu
  2022-05-10 15:31       ` H.J. Lu
  2022-05-10 16:49     ` Alexander Monakov
  1 sibling, 1 reply; 167+ messages in thread
From: H.J. Lu @ 2022-05-10 15:29 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell, Alexander Monakov

On Tue, May 10, 2022 at 8:04 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=25 runs
> Geometric of all benchmark New / Old: 0.791
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    0.641,    0.658,               0.974
>  fixed,      1,    1.888,    1.883,               1.003
>  fixed,      2,    2.712,    2.833,               0.957
>  fixed,      3,    3.314,    3.739,               0.886
>  fixed,      4,    4.316,    4.866,               0.887
>  fixed,      5,     5.16,    5.966,               0.865
>  fixed,      6,    5.986,    7.241,               0.827
>  fixed,      7,    7.264,    8.435,               0.861
>  fixed,      8,    8.052,    9.846,               0.818
>  fixed,      9,    9.369,   11.316,               0.828
>  fixed,     10,   10.256,   12.925,               0.794
>  fixed,     11,   12.191,   14.546,               0.838
>  fixed,     12,   12.667,    15.92,               0.796
>  fixed,     13,   14.442,   17.465,               0.827
>  fixed,     14,   14.808,   18.981,                0.78
>  fixed,     15,   16.244,   20.565,                0.79
>  fixed,     16,   17.166,   22.044,               0.779
>  fixed,     32,   35.447,   50.558,               0.701
>  fixed,     64,   86.479,  134.529,               0.643
>  fixed,    128,  155.453,  287.527,               0.541
>  fixed,    256,   302.57,   593.64,                0.51
> random,      2,   11.168,    10.61,               1.053
> random,      4,   13.308,    13.53,               0.984
> random,      8,   16.579,   19.437,               0.853
> random,     16,   21.292,   24.776,               0.859
> random,     32,    30.56,   35.906,               0.851
> random,     64,   49.249,   68.577,               0.718
> random,    128,   81.845,  140.664,               0.582
> random,    256,  152.517,  292.204,               0.522
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..cacbeec289 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,55 @@
>  #define _DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
>
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *signed_s)
>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) signed_s;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

I don't think it is safe for strictly aligned targets.

> +      /* Unlikely length zero string so evens will be slightly less
> +        common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       {
> +         return h;
> +       }
> +
> +      c1 = (unsigned int) *(s + 1);
> +      if (c1 == 0)
> +       {
> +         c0 += h;
> +         /* Ideal instruction scheduling is:
> +            c0 += h;
> +            h *= 32;
> +            h += c0;
> +            The asm statement ensures the compiler can't mess that up.  */
> +         asm("" : "+r"(h) : "r"(c0));
> +         h = h * 32 + c0;
> +      return h;
> +       }
> +
> +      /* Ideal instruction scheduling is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;
> +        The asm statements ensures the compiler can't mess that up.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>  }
>
>
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 15:29     ` H.J. Lu
@ 2022-05-10 15:31       ` H.J. Lu
  0 siblings, 0 replies; 167+ messages in thread
From: H.J. Lu @ 2022-05-10 15:31 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell, Alexander Monakov

On Tue, May 10, 2022 at 8:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 8:04 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note the unrolling allows
> > for pipelined multiplies which helps a bit, but most of the gain
> > is from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=25 runs
> > Geometric of all benchmark New / Old: 0.791
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    0.641,    0.658,               0.974
> >  fixed,      1,    1.888,    1.883,               1.003
> >  fixed,      2,    2.712,    2.833,               0.957
> >  fixed,      3,    3.314,    3.739,               0.886
> >  fixed,      4,    4.316,    4.866,               0.887
> >  fixed,      5,     5.16,    5.966,               0.865
> >  fixed,      6,    5.986,    7.241,               0.827
> >  fixed,      7,    7.264,    8.435,               0.861
> >  fixed,      8,    8.052,    9.846,               0.818
> >  fixed,      9,    9.369,   11.316,               0.828
> >  fixed,     10,   10.256,   12.925,               0.794
> >  fixed,     11,   12.191,   14.546,               0.838
> >  fixed,     12,   12.667,    15.92,               0.796
> >  fixed,     13,   14.442,   17.465,               0.827
> >  fixed,     14,   14.808,   18.981,                0.78
> >  fixed,     15,   16.244,   20.565,                0.79
> >  fixed,     16,   17.166,   22.044,               0.779
> >  fixed,     32,   35.447,   50.558,               0.701
> >  fixed,     64,   86.479,  134.529,               0.643
> >  fixed,    128,  155.453,  287.527,               0.541
> >  fixed,    256,   302.57,   593.64,                0.51
> > random,      2,   11.168,    10.61,               1.053
> > random,      4,   13.308,    13.53,               0.984
> > random,      8,   16.579,   19.437,               0.853
> > random,     16,   21.292,   24.776,               0.859
> > random,     32,    30.56,   35.906,               0.851
> > random,     64,   49.249,   68.577,               0.718
> > random,    128,   81.845,  140.664,               0.582
> > random,    256,  152.517,  292.204,               0.522
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 45 insertions(+), 5 deletions(-)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > index 40d88c81f9..cacbeec289 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,55 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *signed_s)
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > +  const unsigned char *s = (const unsigned char *) signed_s;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> I don't think it is safe for strictly aligned targets.

Never mind.  I didn't read it properly.

> > +      /* Unlikely length zero string so evens will be slightly less
> > +        common.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       {
> > +         return h;
> > +       }
> > +
> > +      c1 = (unsigned int) *(s + 1);
> > +      if (c1 == 0)
> > +       {
> > +         c0 += h;
> > +         /* Ideal instruction scheduling is:
> > +            c0 += h;
> > +            h *= 32;
> > +            h += c0;
> > +            The asm statement ensures the compiler can't mess that up.  */
> > +         asm("" : "+r"(h) : "r"(c0));
> > +         h = h * 32 + c0;
> > +      return h;
> > +       }
> > +
> > +      /* Ideal instruction scheduling is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;
> > +        The asm statements ensures the compiler can't mess that up.  */
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
> >
> >
> > --
> > 2.34.1
> >
>
>
> --
> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 15:04   ` [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-10 15:29     ` H.J. Lu
@ 2022-05-10 16:49     ` Alexander Monakov
  2022-05-10 17:17       ` Noah Goldstein
  1 sibling, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-10 16:49 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

On Tue, 10 May 2022, Noah Goldstein wrote:

> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

It seems benchmark figures are unchanged from the first iteration of this
patch, did the revision not affect them?

(more comments below)

> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,55 @@
>  #define _DL_NEW_HASH_H 1
>  
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
>  
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *signed_s)

This is technically inaccurate (whether plain 'char' is signed depends on the
target), so if you're revising this further I'd suggest to change this too to
e.g. 'str'.

>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;

I think it would be nice to retain this loop as a comment to indicate what this
function is supposed to implement.

> +  const unsigned char *s = (const unsigned char *) signed_s;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

Surprised to see an explicit cast where plain assignment with implicit type
conversion is doing the obvious thing. Is it really necessary?

> +      /* Unlikely length zero string so evens will be slightly less
> +	 common.  */

I had trouble understanding this comment. I'd suggest dropping it or rephrasing
like 'Since hashed string is normally not empty, this is unlikely on the first
iteration of the loop'.

> +      if (__glibc_unlikely (c0 == 0))
> +	{
> +	  return h;
> +	}

Braces look unnecessary.

> +
> +      c1 = (unsigned int) *(s + 1);

Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
Alternatively, you could use 'c1 = *s++' here and above and drop explicit
s += 2 below, I expect resulting assembly to be the same.

> +      if (c1 == 0)
> +	{
> +	  c0 += h;
> +	  /* Ideal instruction scheduling is:
> +	     c0 += h;
> +	     h *= 32;
> +	     h += c0;
> +	     The asm statement ensures the compiler can't mess that up.  */

The main concern here is preventing reassociation from 'h = h*32 + (c0 + h)'
to 'h = (h*32 + h) + c0', not scheduling. We're using an empty asm to break
up a sequence of additions.

Also note that this leads to a return, so only saves a couple cycles once per
call, not inside the loop.

> +	  asm("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +      return h;

Wrong indentation here.

> +	}
> +
> +      /* Ideal instruction scheduling is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;
> +	 The asm statements ensures the compiler can't mess that up.  */

As above, we are placing empty asms mainly as reassociation barriers.

> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>  }

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 16:49     ` Alexander Monakov
@ 2022-05-10 17:17       ` Noah Goldstein
  2022-05-10 17:40         ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 17:17 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: GNU C Library, H.J. Lu, Carlos O'Donell

On Tue, May 10, 2022 at 11:49 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Tue, 10 May 2022, Noah Goldstein wrote:
>
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note the unrolling allows
> > for pipelined multiplies which helps a bit, but most of the gain
> > is from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> It seems benchmark figures are unchanged from the first iteration of this
> patch, did the revision not affect them?

Didn't rerun, will do for V7.
>
> (more comments below)
>
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,55 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *signed_s)
>
> This is technically inaccurate (whether plain 'char' is signed depends on the
> target), so if you're revising this further I'd suggest to change this too to
> e.g. 'str'.

Will fix for V7.
>
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
>
> I think it would be nice to retain this loop as a comment to indicate what this
> function is supposed to implement.

Will fix for V7.
>
> > +  const unsigned char *s = (const unsigned char *) signed_s;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> Surprised to see an explicit cast where plain assignment with implicit type
> conversion is doing the obvious thing. Is it really necessary?

It's not, I just tend to err on the side of explicitness.

>
> > +      /* Unlikely length zero string so evens will be slightly less
> > +      common.  */
>
> I had trouble understanding this comment. I'd suggest dropping it or rephrasing
> like 'Since hashed string is normally not empty, this is unlikely on the first
> iteration of the loop'.

Will fix for V7.
>
> > +      if (__glibc_unlikely (c0 == 0))
> > +     {
> > +       return h;
> > +     }
>
> Braces look unnecessary.

Will fix v7.
>
> > +
> > +      c1 = (unsigned int) *(s + 1);
>
> Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
> Alternatively, you could use 'c1 = *s++' here and above and drop explicit
> s += 2 below, I expect resulting assembly to be the same.

I generally think using `[]` access makes stylistic sense when incrementing
an index and *(s1 + N) makes sense when incrementing a pointer.

I'm generally in favor of leaving the casts/access as is but it's not a hill
worth dying on.

LMK if it's important to you for V7 (will be a few hours to rerun benchmarks).

>
> > +      if (c1 == 0)
> > +     {
> > +       c0 += h;
> > +       /* Ideal instruction scheduling is:
> > +          c0 += h;
> > +          h *= 32;
> > +          h += c0;
> > +          The asm statement ensures the compiler can't mess that up.  */
>
> The main concern here is preventing reassociation from 'h = h*32 + (c0 + h)'
> to 'h = (h*32 + h) + c0', not scheduling. We're using an empty asm to break
> up a sequence of additions.

Well, the reason (h * 32 + h) + c0 is worse is that it creates scheduling
dependencies, no? If the comment only says "to avoid reassociation", the
natural question is why reassociation matters. Saying it's explicitly for
scheduling answers that.

I'll update the comment in V7 to mention both.
>
> Also note that this leads to a return, so only saves a couple cycles once per
> call, not inside the loop.
>
> > +       asm("" : "+r"(h) : "r"(c0));
> > +       h = h * 32 + c0;
> > +      return h;
>
> Wrong indentation here.

Will fix V7.
>
> > +     }
> > +
> > +      /* Ideal instruction scheduling is:
> > +      c1 += c0;
> > +      h *= 33 * 33;
> > +      c0 *= 32;
> > +      c1 += c0;
> > +      h  += c1;
> > +      The asm statements ensures the compiler can't mess that up.  */
>
> As above, we are placing empty asms mainly as reassociation barriers.

Will update to mention both. It really is a missed optimization in GCC's
instruction scheduling pass though.
>
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 17:17       ` Noah Goldstein
@ 2022-05-10 17:40         ` Alexander Monakov
  0 siblings, 0 replies; 167+ messages in thread
From: Alexander Monakov @ 2022-05-10 17:40 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, H.J. Lu, Carlos O'Donell

On Tue, 10 May 2022, Noah Goldstein wrote:
> > > +
> > > +      c1 = (unsigned int) *(s + 1);
> >
> > Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
> > Alternatively, you could use 'c1 = *s++' here and above and drop explicit
> > s += 2 below, I expect resulting assembly to be the same.
> 
> generally think user `[]` access makes stylistic sense when incrementing
> an index and *(s1 + N) makes sense when incrementing a pointer.
> 
> I'm generally in favor of leaving the casts/access as is but its not a hill
> worth dying on.
> 
> LMK if its important to you for V7 (will be a few hours to rerun benchmarks).

Thanks for the explanation. I'm not an active contributor, so I don't mind.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (12 preceding siblings ...)
  2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-10 23:30 ` Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (4 more replies)
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (3 subsequent siblings)
  17 siblings, 5 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..40d88c81f9
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,35 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-10 23:30   ` Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..0e72f913a0 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..e806a274ca
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Simple implementation of ELF ABI hash function. */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-10 23:30   ` Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..6bb2ce06ab
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-10 23:30   ` Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-10 23:30   ` [PATCH v7 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-10 23:30   ` Noah Goldstein
  2022-05-10 23:30   ` [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  4 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version of
+ * HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-10 23:30   ` [PATCH v7 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-10 23:30   ` Noah Goldstein
  2022-05-10 23:46     ` H.J. Lu
  4 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-10 23:30 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note the unrolling allows
for pipelined multiplies which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean of all benchmarks New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..c6d96a0452 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,71 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+_dl_new_hash (const char *s)
+{
+   uint32_t h = 5381;
+   for (unsigned char c = *s; c != '\0'; c = *++s)
+     h = h * 33 + c;
+   return h;
+}
+
+   We can get better performance, however, by slightly unrolling the
+   loop and explicitly specifying order of operations to prevent
+   reassociation of instructions and ensure ideal scheduling.  */
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *str)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = (unsigned int) *s;
+      /* Zero-length strings are unlikely, so even lengths will be slightly
+	 less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = (unsigned int) *(s + 1);
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
 
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 23:30   ` [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-10 23:46     ` H.J. Lu
  2022-05-11  3:07       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: H.J. Lu @ 2022-05-10 23:46 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, Alexander Monakov

On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    2.865,     2.72,               1.053
>  fixed,      1,    3.567,    2.489,               1.433
>  fixed,      2,    2.577,    3.649,               0.706
>  fixed,      3,    3.644,    5.983,               0.609
>  fixed,      4,    4.211,    6.833,               0.616
>  fixed,      5,    4.741,    9.372,               0.506
>  fixed,      6,    5.415,    9.561,               0.566
>  fixed,      7,    6.649,   10.789,               0.616
>  fixed,      8,    8.081,   11.808,               0.684
>  fixed,      9,    8.427,   12.935,               0.651
>  fixed,     10,    8.673,   14.134,               0.614
>  fixed,     11,    10.69,   15.408,               0.694
>  fixed,     12,   10.789,   16.982,               0.635
>  fixed,     13,   12.169,   18.411,               0.661
>  fixed,     14,   12.659,   19.914,               0.636
>  fixed,     15,   13.526,   21.541,               0.628
>  fixed,     16,   14.211,   23.088,               0.616
>  fixed,     32,   29.412,   52.722,               0.558
>  fixed,     64,    65.41,  142.351,               0.459
>  fixed,    128,  138.505,  295.625,               0.469
>  fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..c6d96a0452 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,71 @@
>  #define _DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +_dl_new_hash (const char *s)
> +{
> +   uint32_t h = 5381;
> +   for (unsigned char c = *s; c != '\0'; c = *++s)
> +     h = h * 33 + c;
> +   return h;
> +}
> +
> +   We can get better performance, however, by slightly unrolling the
> +   loop and explicitly specifying order of operations to prevent
> +   reassosiation of instructions and ensure ideal scheduling.  */
>
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *str)
>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

I prefer

c0 = s[0];

There is no need for a cast since "s" is a pointer to unsigned char.

> +      /* Unlikely length zero string so evens will be slightly less
                                                              even
> +        common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       return h;
> +
> +      c1 = (unsigned int) *(s + 1);

c1 = s[1];

> +      if (c1 == 0)
> +       {
> +         c0 += h;
> +         /* Ideal instruction scheduling is:
> +        c0 += h;
> +        h *= 32;
> +        h += c0;
> +
> +        The asm statements are to prevent reassosiation that would result in
> +        more instruction interdependencies and worse scheduling.  */
> +         asm("" : "+r"(h) : "r"(c0));
              asm (  << A space after asm.
> +         h = h * 32 + c0;
> +         return h;
> +       }
> +
> +      /* Ideal instruction scheduling is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;
> +
> +        The asm statements are to prevent reassosiation that would result in
> +        more instruction interdependencies and worse scheduling.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
            asm (  << A space after asm.
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
           asm (  << A space after asm.
> +      h += c1;
> +      s += 2;
> +    }
>  }

This is faster on x86.  Is this also true on other targets?  Should it be moved
to sysdeps/generic so that other targets can provide a different version?

>
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (13 preceding siblings ...)
  2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-11  3:06 ` Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (5 more replies)
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
                   ` (2 subsequent siblings)
  17 siblings, 6 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h.  Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..40d88c81f9
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,35 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+
+static inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-11  3:06   ` Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..0e72f913a0 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..e806a274ca
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Test _dl_new_hash and _dl_elf_hash against simple implementations.  */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-11  3:06   ` Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..6bb2ce06ab
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-11  3:06   ` Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-11  3:06   ` [PATCH v8 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-11  3:06   ` Noah Goldstein
  2022-05-11  3:06   ` [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-16 13:56   ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much, as it left the dependency
chain between iterations intact. Unroll the loop by 4 so that four
multiplies can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version of
+ * HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-11  3:06   ` [PATCH v8 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-11  3:06   ` Noah Goldstein
  2022-05-16 14:12     ` Siddhesh Poyarekar
  2022-05-16 13:56   ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note that the unrolling allows
for pipelined multiplies, which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4],
but it does help the loop, so if larger sizes are the target, further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean of all benchmarks, New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..5269f6eb98 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,71 @@
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+_dl_new_hash (const char *s)
+{
+   uint32_t h = 5381;
+   for (unsigned char c = *s; c != '\0'; c = *++s)
+     h = h * 33 + c;
+   return h;
+}
+
+   We can get better performance, however, by slightly unrolling the
+   loop and explicitly specifying order of operations to prevent
+   reassociation of instructions and ensure ideal scheduling.  */
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *str)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Zero-length strings are unlikely, so even lengths will be
+	 slightly less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
 
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-10 23:46     ` H.J. Lu
@ 2022-05-11  3:07       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-11  3:07 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell, Alexander Monakov

On Tue, May 10, 2022 at 6:47 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note the unrolling allows
> > for pipelined multiplies which helps a bit, but most of the gain
> > is from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric of all benchmark New / Old: 0.674
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    2.865,     2.72,               1.053
> >  fixed,      1,    3.567,    2.489,               1.433
> >  fixed,      2,    2.577,    3.649,               0.706
> >  fixed,      3,    3.644,    5.983,               0.609
> >  fixed,      4,    4.211,    6.833,               0.616
> >  fixed,      5,    4.741,    9.372,               0.506
> >  fixed,      6,    5.415,    9.561,               0.566
> >  fixed,      7,    6.649,   10.789,               0.616
> >  fixed,      8,    8.081,   11.808,               0.684
> >  fixed,      9,    8.427,   12.935,               0.651
> >  fixed,     10,    8.673,   14.134,               0.614
> >  fixed,     11,    10.69,   15.408,               0.694
> >  fixed,     12,   10.789,   16.982,               0.635
> >  fixed,     13,   12.169,   18.411,               0.661
> >  fixed,     14,   12.659,   19.914,               0.636
> >  fixed,     15,   13.526,   21.541,               0.628
> >  fixed,     16,   14.211,   23.088,               0.616
> >  fixed,     32,   29.412,   52.722,               0.558
> >  fixed,     64,    65.41,  142.351,               0.459
> >  fixed,    128,  138.505,  295.625,               0.469
> >  fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 61 insertions(+), 5 deletions(-)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > index 40d88c81f9..c6d96a0452 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,71 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +_dl_new_hash (const char *s)
> > +{
> > +   uint32_t h = 5381;
> > +   for (unsigned char c = *s; c != '\0'; c = *++s)
> > +     h = h * 33 + c;
> > +   return h;
> > +}
> > +
> > +   We can get better performance, however, by slightly unrolling the
> > +   loop and explicitly specifying order of operations to prevent
> > +   reassosiation of instructions and ensure ideal scheduling.  */
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *str)
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> I prefer
>
> c0 = s[0];
>
> There is no need for a cast since "s" is a pointer to unsigned char.

Fixed in V8.
>
> > +      /* Unlikely length zero string so evens will be slightly less
>                                                               even
> > +        common.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       return h;
> > +
> > +      c1 = (unsigned int) *(s + 1);
>
> c1 = s[1];

Fixed in V8.
>
> > +      if (c1 == 0)
> > +       {
> > +         c0 += h;
> > +         /* Ideal instruction scheduling is:
> > +        c0 += h;
> > +        h *= 32;
> > +        h += c0;
> > +
> > +        The asm statements are to prevent reassosiation that would result in
> > +        more instruction interdependencies and worse scheduling.  */
> > +         asm("" : "+r"(h) : "r"(c0));
>               asm (  << A space after asm.
> > +         h = h * 32 + c0;
> > +         return h;
> > +       }
> > +
> > +      /* Ideal instruction scheduling is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;
> > +
> > +        The asm statements are to prevent reassosiation that would result in
> > +        more instruction interdependencies and worse scheduling.  */
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
>             asm (  << A space after asm.
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
>            asm (  << A space after asm.
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
>
> This is faster on x86.  Is this also true on other targets?  Should it be moved
> to sysdeps/generic so that other targets can provide a different version?

I haven't tested any other targets, but the optimizations are pretty generic, so
I would imagine similar results on other architectures.
>
> >
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-05-11  3:06   ` [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-16 13:56   ` Siddhesh Poyarekar
  2022-05-16 20:31     ` Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-16 13:56 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 11/05/2022 08:36, Noah Goldstein via Libc-alpha wrote:
> No change to the code other than moving the function to
> dl-new-hash.h. Changed name so it's now in the reserved namespace.
> ---
>   elf/dl-lookup.c   | 13 ++-----------
>   elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
>   2 files changed, 37 insertions(+), 11 deletions(-)
>   create mode 100644 elf/dl-new-hash.h
> 
> diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> index 989b073e4f..a42f6d5390 100644
> --- a/elf/dl-lookup.c
> +++ b/elf/dl-lookup.c
> @@ -24,6 +24,7 @@
>   #include <ldsodefs.h>
>   #include <dl-hash.h>
>   #include <dl-machine.h>
> +#include <dl-new-hash.h>
>   #include <dl-protected.h>
>   #include <sysdep-cancel.h>
>   #include <libc-lock.h>
> @@ -558,16 +559,6 @@ skip:
>   }
>   
>   
> -static uint32_t
> -dl_new_hash (const char *s)
> -{
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> -}
> -
> -
>   /* Add extra dependency on MAP to UNDEF_MAP.  */
>   static int
>   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
>   		     const struct r_found_version *version,
>   		     int type_class, int flags, struct link_map *skip_map)
>   {
> -  const unsigned int new_hash = dl_new_hash (undef_name);
> +  const unsigned int new_hash = _dl_new_hash (undef_name);
>     unsigned long int old_hash = 0xffffffff;
>     struct sym_val current_value = { NULL, NULL };
>     struct r_scope_elem **scope = symbol_scope;
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> new file mode 100644
> index 0000000000..40d88c81f9
> --- /dev/null
> +++ b/elf/dl-new-hash.h
> @@ -0,0 +1,35 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +
> +static inline uint32_t

This has only one caller and ISTM that it's always desirable to inline 
this, so use __always_inline instead.

> +__attribute__ ((unused))
> +_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
> +
> +#endif /* dl-new-hash.h */


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-11  3:06   ` [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-16 14:12     ` Siddhesh Poyarekar
  2022-05-16 14:31       ` Alexander Monakov
  2022-05-16 18:09       ` Alexander Monakov
  0 siblings, 2 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-16 14:12 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha; +Cc: Alexander Monakov

On 11/05/2022 08:36, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 61 insertions(+), 5 deletions(-)
> 
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..5269f6eb98 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,71 @@
>   #define _DL_NEW_HASH_H 1
>   
>   #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +_dl_new_hash (const char *s)
> +{
> +   uint32_t h = 5381;
> +   for (unsigned char c = *s; c != '\0'; c = *++s)
> +     h = h * 33 + c;
> +   return h;
> +}
> +
> +   We can get better performance, however, by slightly unrolling the
> +   loop and explicitly specifying order of operations to prevent
> +   reassosiation of instructions and ensure ideal scheduling.  */
>   
>   static inline uint32_t
>   __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *str)
>   {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Unlikely length zero string so evens will be slightly less
> +	 common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  c0 += h;
> +	  /* Ideal instruction scheduling is:
> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;
> +
> +	 The asm statements are to prevent reassosiation that would result in
> +	 more instruction interdependencies and worse scheduling.  */
> +	  asm("" : "+r"(h) : "r"(c0));

There are a couple of things that seem problematic to me about this:

- It seems like we're trying to fix a gcc issue in glibc.  Couldn't we 
file a gcc bug and explore ways in which this could be supported in the 
compiler?  In fact, it might make sense to do that for the original 
loop; it looks like a missed optimization that gcc ought to fix.  IMO 
the bug should be filed even if we do end up with this 
micro-optimization in glibc.

- The patch controls an instruction schedule so that it works well on 
out-of-order processors but then only quotes one microarchitecture.  If 
it works well on TigerLake (and on x86 in general) then it might be 
better to add it as a sysdep override; I assumed that was the point of 
breaking the function out into its header anyway.  If it is more 
generally useful then please share numbers to that effect in the commit 
message and also explicitly state in the comments why we're trying to 
exert this level of control on codegen in generic C code and why it is 
good for all architectures.

> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal instruction scheduling is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;
> +
> +	 The asm statements are to prevent reassosiation that would result in
> +	 more instruction interdependencies and worse scheduling.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>   }
>   
>   

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 14:12     ` Siddhesh Poyarekar
@ 2022-05-16 14:31       ` Alexander Monakov
  2022-05-16 16:23         ` Siddhesh Poyarekar
  2022-05-16 18:09       ` Alexander Monakov
  1 sibling, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 14:31 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: Noah Goldstein, libc-alpha

On Mon, 16 May 2022, Siddhesh Poyarekar wrote:

> There are a couple of things that seem problematic to me about this:
> 
> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
> gcc bug and explore ways in which this could be supported in the compiler?  In
> fact, it might make sense to do that for the original loop; it looks like a
> missed optimization that gcc ought to fix.  IMO the bug should be filed even
> if we do end up with this micro-optimization in glibc.

This issue involves a chain of dependencies that goes across all loop
iterations, but the relevant compiler optimizations (reassociation, register
allocation, scheduling) do not consider such global chains. You might
file this as a "wishlist" bug, but compiler infrastructure is simply
not designed to make such nontrivial decisions.

> - The patch controls an instruction schedule so that it works well on
> out-of-order processors but then only quoting one microarchitecture.

It's not specific to out-of-order processors: a long chain of dependencies
restricts OoO scheduling in the CPU. So in the end it benefits "classic"
and OoO pipelines in a similar fashion.

> If it
> works well on TigerLake (and on x86 in general) then it might be better to add
> it as a sysdep override; I assumed that was the point of breaking the function
> out into its header anyway.  If it is more generally useful then please share
> numbers to that effect in the commit message and also explicitly state in the
> comments why we're trying to exert this level of control on codegen in generic
> C code and why it is good for all architectures.

I guess it's up to you and Noah to hash it out, but I'd like to remind you that
there was an alternative variant which is a strict win on all architectures
(same code size, same instruction mix, no dependency on fast multiplication).
That might be easier to justify from generic code point of view.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 14:31       ` Alexander Monakov
@ 2022-05-16 16:23         ` Siddhesh Poyarekar
  2022-05-16 16:38           ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-16 16:23 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Noah Goldstein, libc-alpha

On 16/05/2022 20:01, Alexander Monakov wrote:
> On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> 
>> There are a couple of things that seem problematic to me about this:
>>
>> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
>> gcc bug and explore ways in which this could be supported in the compiler?  In
>> fact, it might make sense to do that for the original loop; it looks like a
>> missed optimization that gcc ought to fix.  IMO the bug should be filed even
>> if we do end up with this micro-optimization in glibc.
> 
> This issue involves a chain of dependencies that goes across all loop
> iterations, but relevant compiler optimization (reassociation, register
> allocation, scheduling) do not consider such global chains. You might
> file this as a "wishlist" bug, but compiler infrastructure is simply
> not designed to make such nontrivial decisions.

Thanks for the context, this should go into comments.  A wishlist bug 
would be nice but I suspect it'll just gather dust.  Maybe it's still 
useful for someone coming in after 10-15 years looking for more context 
on it.

>> - The patch controls an instruction schedule so that it works well on
>> out-of-order processors but then only quoting one microarchitecture.
> 
> It's not specific to out-of-order processors: a long chain of dependencies
> restricts OoO scheduling in the CPU. So in the end it benefits "classic"
> and OoO pipelines in a similar fashion.
> 
>> If it
>> works well on TigerLake (and on x86 in general) then it might be better to add
>> it as a sysdep override; I assumed that was the point of breaking the function
>> out into its header anyway.  If it is more generally useful then please share
>> numbers to that effect in the commit message and also explicitly state in the
>> comments why we're trying to exert this level of control on codegen in generic
>> C code and why it is good for all architectures.
> 
> I guess it's up to you and Noah to hash it out, but I'd like to remind that
> there was an alternative variant which is a strict win on all architectures
> (same code size, same instruction mix, no dependency on fast multiplication).
> That might be easier to justify from generic code point of view.

I would prefer the earlier variant in generic code, with (if necessary) 
the scheduling hack being a sysdep for x86.  Other architectures that 
want to use the latter should #include it and also post microbenchmark 
results so that we keep track of how we arrived at that decision.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 16:23         ` Siddhesh Poyarekar
@ 2022-05-16 16:38           ` Noah Goldstein
  2022-05-16 16:44             ` Siddhesh Poyarekar
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 16:38 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: Alexander Monakov, GNU C Library

On Mon, May 16, 2022 at 11:23 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 16/05/2022 20:01, Alexander Monakov wrote:
> > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> >
> >> There are a couple of things that seem problematic to me about this:
> >>
> >> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
> >> gcc bug and explore ways in which this could be supported in the compiler?  In
> >> fact, it might make sense to do that for the original loop; it looks like a
> >> missed optimization that gcc ought to fix.  IMO the bug should be filed even
> >> if we do end up with this micro-optimization in glibc.
> >
> > This issue involves a chain of dependencies that goes across all loop
> > iterations, but relevant compiler optimization (reassociation, register
> > allocation, scheduling) do not consider such global chains. You might
> > file this as a "wishlist" bug, but compiler infrastructure is simply
> > not designed to make such nontrivial decisions.
>
> Thanks for the context, this should go into comments.  A wishlist bug
> would be nice but I suspect it'll just gather dust.  Maybe it's still
> useful for someone coming in after 10-15 years looking for more context
> on it.

I'll add a comment in the next version.
>
> >> - The patch controls an instruction schedule so that it works well on
> >> out-of-order processors but then only quoting one microarchitecture.
> >
> > It's not specific to out-of-order processors: a long chain of dependencies
> > restricts OoO scheduling in the CPU. So in the end it benefits "classic"
> > and OoO pipelines in a similar fashion.
> >
> >> If it
> >> works well on TigerLake (and on x86 in general) then it might be better to add
> >> it as a sysdep override; I assumed that was the point of breaking the function
> >> out into its header anyway.  If it is more generally useful then please share
> >> numbers to that effect in the commit message and also explicitly state in the
> >> comments why we're trying to exert this level of control on codegen in generic
> >> C code and why it is good for all architectures.
> >
> > I guess it's up to you and Noah to hash it out, but I'd like to remind that
> > there was an alternative variant which is a strict win on all architectures
> > (same code size, same instruction mix, no dependency on fast multiplication).
> > That might be easier to justify from generic code point of view.
>
> I would prefer the earlier variant in generic code, with (if necessary)
> the scheduling hack being a sysdep for x86.  Other architectures that
> want to use the latter should #include it and also post microbenchmark
> results so that we keep track of how we arrived at that decision.

I'm happy to switch it back but I don't think the scheduling hack is x86
oriented. I don't think re-ordering could ever de-optimize things.
The only real architectural assumption is a reasonably fast
32-bit multiply which is true for both the more generic earlier version
and the current one.
>
> Thanks,
> Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 16:38           ` Noah Goldstein
@ 2022-05-16 16:44             ` Siddhesh Poyarekar
  2022-05-16 20:32               ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-16 16:44 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Alexander Monakov, GNU C Library

On 16/05/2022 22:08, Noah Goldstein wrote:
>> Thanks for the context, this should go into comments.  A wishlist bug
>> would be nice but I suspect it'll just gather dust.  Maybe it's still
>> useful for someone coming in after 10-15 years looking for more context
>> on it.
> 
> I'll add a comment in the next version.

Thanks.

>> I would prefer the earlier variant in generic code, with (if necessary)
>> the scheduling hack being a sysdep for x86.  Other architectures that
>> want to use the latter should #include it and also post microbenchmark
>> results so that we keep track of how we arrived at that decision.
> 
> I'm happy to switch it back but I don't think the scheduling hack is x86
> oriented. I don't think re-ordering could ever de-optimize things.
> The only real architectural assumption is a reasonably fast
> 32-bit multiply which is true for both the more generic earlier version
> and the current one.

I don't entirely disagree, but I think the conservative stance here is 
to keep the scheduling hack in the sysdep that has actually been tested 
and then bring it out into generic if it has been experimentally 
verified to be a universal win for all architectures we support.  That 
is, if we find that every architecture is including the sysdep version, 
then it's time to bring it out to replace the generic version.

FWIW, I'm lowering the bar for acceptance because you only need to 
verify that the scheduling hack is better for the architecture you're 
interested in, not all architectures we support :)

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 14:12     ` Siddhesh Poyarekar
  2022-05-16 14:31       ` Alexander Monakov
@ 2022-05-16 18:09       ` Alexander Monakov
  2022-05-16 18:47         ` Siddhesh Poyarekar
  1 sibling, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 18:09 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: Noah Goldstein, libc-alpha

On Mon, 16 May 2022, Siddhesh Poyarekar wrote:

> There are a couple of things that seem problematic to me about this:
[snip]

Since Carlos mentioned in today's patch review message that you didn't like
something about the asms, can you (or Adhemerval) please explain what you
meant? The asms in the patch have empty body and standard "r" constraints,
I'd say they are perfectly portable.

Personally I am highly uncomfortable with situations when maintainers have
a substantial (in their opinion) objection to the patch, but the objection
is not communicated back to the submitter, so they cannot defend their patch
or even learn about the issue to account for it in future work, but at the
same time it is discussed "behind the submitter's back" (potentially
disparaging or merely misunderstanding their work).

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 18:09       ` Alexander Monakov
@ 2022-05-16 18:47         ` Siddhesh Poyarekar
  2022-05-16 19:28           ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-16 18:47 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Noah Goldstein, libc-alpha

On 16/05/2022 23:39, Alexander Monakov wrote:
> On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> 
>> There are a couple of things that seem problematic to me about this:
> [snip]
> 
> Since Carlos mentioned in today's patch review message that you didn't like
> something about the asms, can you (or Adhemerval) please explain what you
> meant? The asms in the patch have empty body and standard "r" constraints,
> I'd say they are perfectly portable.

I did explain; I am not comfortable with controlling instruction 
scheduling in that manner for generic code because it assumes more about 
underlying processor pipelines and instruction sequences than we 
typically do in generic code.  It has nothing to do with portability. 
Adhemerval raised the question about whether this ought to be done in 
gcc instead, which I concurred with too.

I volunteered to share that position here, which I believe I did to the 
best of my knowledge and understanding of the issues.  I think talking 
about "the asms" would have only confused the issue since it's not 
really about that construct.

> Personally I am highly uncomfortable with situations when maintainers have
> a substantial (in their opinion) objection to the patch, but the objection
> is not communicated back to the submitter, so they cannot defend their patch
> or even learn about the issue to account for it in future work, but at the
> same time it is discussed "behind the submitter's back" (potentially
> disparaging or merely misunderstanding their work).

The submitter (Noah) has often attended the review calls so it's just 
unfortunate that he didn't happen to be on it today.

The main point of this call is finding reviewers for patches and the 
secondary one is for submitters to solicit reviews or discussions if 
they're stuck on a problem.  The call is open to everyone too[1].  While 
some patches get closer scrutiny, the primary purpose is to find out why 
the patch is stuck in review, not to decide on whether to ack or nack 
it.  Usually the submitter is present in the closer scrutiny but if not, 
the action item always is for one of us to volunteer to take a closer 
look and share findings on the mailing list.

Everything, without exception, lands on the mailing list to the last 
detail.  Except maybe jokes about someone's audio not working or 
conversations about the weather.

Siddhesh

[1] https://sourceware.org/glibc/wiki/PatchworkReviewMeetings

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 18:47         ` Siddhesh Poyarekar
@ 2022-05-16 19:28           ` Alexander Monakov
  2022-05-16 19:35             ` Noah Goldstein
  2022-05-17  1:45             ` Siddhesh Poyarekar
  0 siblings, 2 replies; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 19:28 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: Noah Goldstein, libc-alpha

On Tue, 17 May 2022, Siddhesh Poyarekar wrote:

> On 16/05/2022 23:39, Alexander Monakov wrote:
> > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> > 
> >> There are a couple of things that seem problematic to me about this:
> > [snip]
> > 
> > Since Carlos mentioned in today's patch review message that you didn't like
> > something about the asms, can you (or Adhemerval) please explain what you
> > meant? The asms in the patch have empty body and standard "r" constraints,
> > I'd say they are perfectly portable.
> 
> I did explain; I am not comfortable with controlling instruction scheduling in
> that manner for generic code because it assumes more about underlying
> processor pipelines and instruction sequences than we typically do in generic
> code.  It has nothing to do with portability. Adhemerval raised the question
> about whether this ought to be done in gcc instead, which I concurred with
> too.

Thank you very much for the detailed response. Allow me to clear up what seems
to be a technical misunderstanding here: this is not about instruction
scheduling, but rather dependencies in the computations (I know Noah mentioned
scheduling, but it's confusing especially in context of benchmarking for an
out-of-order CPU).

I have shown how different variants have different chains of dependencies in
this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html

The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
the dependency graph in context of the whole loop.

There's nothing specific to the x86 architecture in this reasoning. On arm and
aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:28           ` Alexander Monakov
@ 2022-05-16 19:35             ` Noah Goldstein
  2022-05-16 19:41               ` Alexander Monakov
  2022-05-17  1:45             ` Siddhesh Poyarekar
  1 sibling, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 19:35 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Siddhesh Poyarekar, GNU C Library

On Mon, May 16, 2022 at 2:28 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Tue, 17 May 2022, Siddhesh Poyarekar wrote:
>
> > On 16/05/2022 23:39, Alexander Monakov wrote:
> > > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> > >
> > >> There are a couple of things that seem problematic to me about this:
> > > [snip]
> > >
> > > Since Carlos mentioned in today's patch review message that you didn't like
> > > something about the asms, can you (or Adhemerval) please explain what you
> > > meant? The asms in the patch have empty body and standard "r" constraints,
> > > I'd say they are perfectly portable.
> >
> > I did explain; I am not comfortable with controlling instruction scheduling in
> > that manner for generic code because it assumes more about underlying
> > processor pipelines and instruction sequences than we typically do in generic
> > code.  It has nothing to do with portability. Adhemerval raised the question
> > about whether this ought to be done in gcc instead, which I concurred with
> > too.
>
> Thank you very much for the detailed response. Allow me to clear up what seems
> to be a technical misunderstanding here: this is not about instruction
> scheduling, but rather dependencies in the computations (I know Noah mentioned
> scheduling, but it's confusing especially in context of benchmarking for an
> out-of-order CPU).
>
> I have shown how different variants have different chains of dependencies in
> this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html
>
> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> the dependency graph in context of the whole loop.

Some architecture could have a really fast integer MADD instruction, and
the barrier could either prevent it from being emitted or force an extra
ADD instruction after it.

I think most architectures should use the barriers, but it's fair to leave
that up to the arch maintainer.


>
> There's nothing specific to the x86 architecture in this reasoning. On arm and
> aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.
>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:35             ` Noah Goldstein
@ 2022-05-16 19:41               ` Alexander Monakov
  2022-05-16 19:47                 ` Adhemerval Zanella
  2022-05-16 19:48                 ` Noah Goldstein
  0 siblings, 2 replies; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 19:41 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Siddhesh Poyarekar, GNU C Library

On Mon, 16 May 2022, Noah Goldstein wrote:

> > The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> > to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> > the dependency graph in context of the whole loop.
> 
> Some architecture could have a really fast integer MADD instruction that
> the barrier could either prevent from being emitted or add an extra ADD
> instruction at the end of.

With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
decides to emit madd vs. 2 cycles if it's only additions.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:41               ` Alexander Monakov
@ 2022-05-16 19:47                 ` Adhemerval Zanella
  2022-05-16 20:00                   ` Alexander Monakov
  2022-05-16 19:48                 ` Noah Goldstein
  1 sibling, 1 reply; 167+ messages in thread
From: Adhemerval Zanella @ 2022-05-16 19:47 UTC (permalink / raw)
  To: Alexander Monakov, Noah Goldstein; +Cc: GNU C Library



On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
> On Mon, 16 May 2022, Noah Goldstein wrote:
> 
>>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
>>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
>>> the dependency graph in context of the whole loop.
>>
>> Some architecture could have a really fast integer MADD instruction that
>> the barrier could either prevent from being emitted or add an extra ADD
>> instruction at the end of.
> 
> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> decides to emit madd vs. 2 cycles if it's only additions.
> 
> Alexander

How hard would it be to make the compiler perform this very optimization?
I raised this on the weekly call because more and more it seems that tuning
computation dependencies in loops is more a compiler job than libc's
(this is not a blocker, but we have had multiple small micro-optimizations
in the past that turned into dead code due to the compiler catching up).

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:41               ` Alexander Monakov
  2022-05-16 19:47                 ` Adhemerval Zanella
@ 2022-05-16 19:48                 ` Noah Goldstein
  2022-05-16 20:33                   ` Alexander Monakov
  1 sibling, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 19:48 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Siddhesh Poyarekar, GNU C Library

On Mon, May 16, 2022 at 2:41 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Mon, 16 May 2022, Noah Goldstein wrote:
>
> > > The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> > > to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> > > the dependency graph in context of the whole loop.
> >
> > Some architecture could have a really fast integer MADD instruction that
> > the barrier could either prevent from being emitted or add an extra ADD
> > instruction at the end of.
>
> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> decides to emit madd vs. 2 cycles if it's only additions.

AFAIK the shift-by-5 + 2x adds is the best that exists at the moment, but
the point is that some arch we aren't thinking about, or some future
variant, may implement a madd that takes 2 cycles or fewer.
>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:47                 ` Adhemerval Zanella
@ 2022-05-16 20:00                   ` Alexander Monakov
  2022-05-16 20:08                     ` Adhemerval Zanella
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 20:00 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Noah Goldstein, GNU C Library

> On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
> > On Mon, 16 May 2022, Noah Goldstein wrote:
> > 
> >>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> >>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> >>> the dependency graph in context of the whole loop.
> >>
> >> Some architecture could have a really fast integer MADD instruction that
> >> the barrier could either prevent from being emitted or add an extra ADD
> >> instruction at the end of.
> > 
> > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > decides to emit madd vs. 2 cycles if it's only additions.
> > 
> > Alexander
> 
> How hard would to make compiler to make this very optimization? I raised 
> this on weekly call because more and more it seems that tuning computation
> dependencies for loop tuning seems to be more a compiler job than libc's
> (although this not a blocker, but we have multiple smalls micro-optimizations
> in the past that turned in dead code due compiler catching up).

Sorry, since you're responding to a discussion about multiply-add, it's unclear
to me which optimization you mean. Is your question about choosing which
sequence of additions has shorter cross-iteration chain?

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 20:00                   ` Alexander Monakov
@ 2022-05-16 20:08                     ` Adhemerval Zanella
  2022-05-16 20:27                       ` Alexander Monakov
  0 siblings, 1 reply; 167+ messages in thread
From: Adhemerval Zanella @ 2022-05-16 20:08 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Noah Goldstein, GNU C Library



On 16/05/2022 17:00, Alexander Monakov wrote:
>> On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
>>> On Mon, 16 May 2022, Noah Goldstein wrote:
>>>
>>>>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
>>>>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
>>>>> the dependency graph in context of the whole loop.
>>>>
>>>> Some architecture could have a really fast integer MADD instruction that
>>>> the barrier could either prevent from being emitted or add an extra ADD
>>>> instruction at the end of.
>>>
>>> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
>>> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
>>> decides to emit madd vs. 2 cycles if it's only additions.
>>>
>>> Alexander
>>
>> How hard would to make compiler to make this very optimization? I raised 
>> this on weekly call because more and more it seems that tuning computation
>> dependencies for loop tuning seems to be more a compiler job than libc's
>> (although this not a blocker, but we have multiple smalls micro-optimizations
>> in the past that turned in dead code due compiler catching up).
> 
> Sorry, since you're responding to a discussion about multiply-add, it's unclear
> to me which optimization you mean. Is your question about choosing which
> sequence of additions has shorter cross-iteration chain?

Indeed I was not clear; I mean the reply in [1], where you explain why
you suggested the asm to prevent the compiler from reassociating.

[1] https://sourceware.org/pipermail/libc-alpha/2022-May/138794.html

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 20:08                     ` Adhemerval Zanella
@ 2022-05-16 20:27                       ` Alexander Monakov
  0 siblings, 0 replies; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 20:27 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: GNU C Library

On Mon, 16 May 2022, Adhemerval Zanella via Libc-alpha wrote:

> >> How hard would to make compiler to make this very optimization? I raised 
> >> this on weekly call because more and more it seems that tuning computation
> >> dependencies for loop tuning seems to be more a compiler job than libc's
> >> (although this not a blocker, but we have multiple smalls micro-optimizations
> >> in the past that turned in dead code due compiler catching up).
> > 
> > Sorry, since you're responding to a discussion about multiply-add, it's unclear
> > to me which optimization you mean. Is your question about choosing which
> > sequence of additions has shorter cross-iteration chain?
> 
> Indeed I was not clear, I mean the reply to [1] where you explain why 
> you have suggested the asm to prevent compiler reassociating.  
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2022-May/138794.html

I think it's pretty hard, you'd have to decompose 'h*33' into '(h<<5)+h'
in the reassociation pass, notice that it's a part of addition chain that
feeds the phi node for 'h', and based on that select a specific
association variant (all to shave off one cycle per iteration). To me it
looks like an optimization just for this exact scenario. And then you
need to "hope" that no other pass undoes this transformation.

It would be quite some nontrivial code in the compiler, when the alternative is
getting a guaranteed outcome for any compiler by adding an empty asm statement
in a loop that iterates thousands of times on every process startup.

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (14 preceding siblings ...)
  2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
@ 2022-05-16 20:29 ` Noah Goldstein
  2022-05-16 20:30   ` [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (5 more replies)
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
  17 siblings, 6 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:29 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed the name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..b7a91ecc07
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,37 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
@ 2022-05-16 20:30   ` Noah Goldstein
  2022-05-17  4:19     ` Siddhesh Poyarekar
  2022-05-16 20:30   ` [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:30 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile      |   1 +
 elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..0e72f913a0 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -309,6 +309,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..e806a274ca
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+/* Test _dl_new_hash and _dl_elf_hash against simple reference versions.  */
+
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+static unsigned int
+simple_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+static unsigned int
+simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		  res);
+    }
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	{
+	  v = 1;
+	}
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    {
+      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
+    {
+      return 1;
+    }
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
  2022-05-16 20:30   ` [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-16 20:30   ` Noah Goldstein
  2022-05-17  4:32     ` Siddhesh Poyarekar
  2022-05-16 20:30   ` [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:30 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile       |   1 +
 nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..6bb2ce06ab
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,105 @@
+/* Test __nss_hash
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+/* Simplest implementation of __nss_hash.  */
+static uint32_t
+simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    {
+      h = *key++ + 65599 * h;
+    }
+  return h;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+    }
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    {
+      buf[i] = random ();
+    }
+
+  expec = simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    {
+      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+      return 1;
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    {
+	      return 1;
+	    }
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    {
+	      return 1;
+	    }
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
  2022-05-16 20:30   ` [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-16 20:30   ` [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-16 20:30   ` Noah Goldstein
  2022-05-17  4:52     ` Siddhesh Poyarekar
  2022-05-16 20:30   ` [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:30 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile            |  25 ++++-
 benchtests/README              |   9 +-
 benchtests/bench-dl-elf-hash.c |  23 ++++
 benchtests/bench-dl-new-hash.c |  23 ++++
 benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c    |  24 ++++
 6 files changed, 292 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..5ca5116ad3
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define TEST_NAME "_dl_elf_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..f5be528960
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,23 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..85cf7de8bc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,196 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 256
+};
+
+static double __attribute__ ((noinline, noclone))
+do_one_test_kernel (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+static double
+do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..085e1f8ee2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,24 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#define TEST_FUNC __nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-16 20:30   ` [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-16 20:30   ` Noah Goldstein
  2022-05-17  5:11     ` Siddhesh Poyarekar
  2022-05-16 20:30   ` [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-17  3:34   ` [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:30 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations. Unrolled the loop for 4 so 4x multiplies
could be pipelined in out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..c6a375f386 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-16 20:30   ` [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-16 20:30   ` Noah Goldstein
  2022-05-17  5:12     ` Siddhesh Poyarekar
  2022-05-17  3:34   ` [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:30 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. The unrolling allows for
pipelined multiplies.

As well, as an optional sysdep, reorder the operations and prevent
reassociation for better scheduling and higher ILP. This commit
only adds the barrier for x86, although it should be either no
change or a win for any architecture.

Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean of all benchmarks New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 sysdeps/generic/dl-new-hash.h      | 114 +++++++++++++++++++++++++++++
 {elf => sysdeps/x86}/dl-new-hash.h |  16 +---
 2 files changed, 117 insertions(+), 13 deletions(-)
 create mode 100644 sysdeps/generic/dl-new-hash.h
 rename {elf => sysdeps/x86}/dl-new-hash.h (77%)

diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
new file mode 100644
index 0000000000..84aa7991a4
--- /dev/null
+++ b/sysdeps/generic/dl-new-hash.h
@@ -0,0 +1,114 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+   _dl_new_hash (const char *s)
+   {
+      uint32_t h = 5381;
+      for (unsigned char c = *s; c != '\0'; c = *++s)
+        h = h * 33 + c;
+      return h;
+   }
+
+   We can get better performance by slightly unrolling the
+   loop to pipeline the multiplies.
+
+   As well, as an architecture-specific option, we add asm statements
+   that explicitly specify the order of operations to prevent
+   reassociation of instructions that lengthens the loop-carried
+   dependency.  This may have no effect as the compiler may have
+   ordered instructions the same way without it, but in testing this
+   has not been the case for GCC.  Improving GCC to reliably schedule
+   instructions ideally cannot be easily done.
+
+   Architecture(s) that use the reassociation barriers are:
+   x86
+
+   Note it is very unlikely the reassociation barriers would
+   de-optimize performance on any architecture, and with an imperfect
+   compiler they may help performance, especially on out-of-order CPUs,
+   so it is suggested that the respective maintainers add them.  */
+
+
+#ifndef __asm_reassociation_barrier
+# define __asm_reassociation_barrier(...)
+#endif
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *str)
+{
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Since hashed string is normally not empty, this is unlikely on the
+	 first iteration of the loop.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  /* Ideal instruction scheduling is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;
+
+	 The __asm_reassociation_barrier() macro is a sysdep-optional asm
+	 statement that prevents reassociation that would result in more
+	 instruction interdependencies and worse scheduling.  */
+	  c0 += h;
+	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+
+	 The __asm_reassociation_barrier() macro is a sysdep-optional asm
+	 statement that prevents reassociation that would result in more
+	 instruction interdependencies and worse scheduling.  */
+      c1 += c0;
+      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      __asm_reassociation_barrier("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
+}
+
+#endif /* dl-new-hash.h */
diff --git a/elf/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
similarity index 77%
rename from elf/dl-new-hash.h
rename to sysdeps/x86/dl-new-hash.h
index b7a91ecc07..dd800265bf 100644
--- a/elf/dl-new-hash.h
+++ b/sysdeps/x86/dl-new-hash.h
@@ -19,19 +19,9 @@
 #ifndef _DL_NEW_HASH_H
 #define _DL_NEW_HASH_H 1
 
-#include <stdint.h>
-/* For __always_inline.  */
-#include <sys/cdefs.h>
-
-static __always_inline uint32_t
-__attribute__ ((unused))
-_dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
+#define __asm_reassociation_barrier __asm__
 
+#undef _DL_NEW_HASH_H
+#include <sysdeps/generic/dl-new-hash.h>
 
 #endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-16 13:56   ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
@ 2022-05-16 20:31     ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:31 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Mon, May 16, 2022 at 8:57 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 11/05/2022 08:36, Noah Goldstein via Libc-alpha wrote:
> > No change to the code other than moving the function to
> > dl-new-hash.h. Changed name so its now in the reserved namespace.
> > ---
> >   elf/dl-lookup.c   | 13 ++-----------
> >   elf/dl-new-hash.h | 35 +++++++++++++++++++++++++++++++++++
> >   2 files changed, 37 insertions(+), 11 deletions(-)
> >   create mode 100644 elf/dl-new-hash.h
> >
> > diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> > index 989b073e4f..a42f6d5390 100644
> > --- a/elf/dl-lookup.c
> > +++ b/elf/dl-lookup.c
> > @@ -24,6 +24,7 @@
> >   #include <ldsodefs.h>
> >   #include <dl-hash.h>
> >   #include <dl-machine.h>
> > +#include <dl-new-hash.h>
> >   #include <dl-protected.h>
> >   #include <sysdep-cancel.h>
> >   #include <libc-lock.h>
> > @@ -558,16 +559,6 @@ skip:
> >   }
> >
> >
> > -static uint32_t
> > -dl_new_hash (const char *s)
> > -{
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > -}
> > -
> > -
> >   /* Add extra dependency on MAP to UNDEF_MAP.  */
> >   static int
> >   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> > @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
> >                    const struct r_found_version *version,
> >                    int type_class, int flags, struct link_map *skip_map)
> >   {
> > -  const unsigned int new_hash = dl_new_hash (undef_name);
> > +  const unsigned int new_hash = _dl_new_hash (undef_name);
> >     unsigned long int old_hash = 0xffffffff;
> >     struct sym_val current_value = { NULL, NULL };
> >     struct r_scope_elem **scope = symbol_scope;
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..40d88c81f9
> > --- /dev/null
> > +++ b/elf/dl-new-hash.h
> > @@ -0,0 +1,35 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +
> > +static inline uint32_t
>
> This has only one caller and ISTM that it's always desirable to inline
> this, so use __always_inline instead.

Fixed in v9.
>
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> > +
> > +#endif /* dl-new-hash.h */
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 16:44             ` Siddhesh Poyarekar
@ 2022-05-16 20:32               ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 20:32 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: Alexander Monakov, GNU C Library

On Mon, May 16, 2022 at 11:44 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 16/05/2022 22:08, Noah Goldstein wrote:
> >> Thanks for the context, this should go into comments.  A wishlist bug
> >> would be nice but I suspect it'll just gather dust.  Maybe it's still
> >> useful for someone coming in after 10-15 years looking for more context
> >> on it.
> >
> > I'll add a comment in the next version.
>
> Thanks.
>
> >> I would prefer the earlier variant in generic code, with (if necessary)
> >> the scheduling hack being a sysdep for x86.  Other architectures that
> >> want to use the latter should #include it and also post microbenchmark
> >> results so that we keep track of how we arrived at that decision.
> >
> > I'm happy to switch it back but I don't think the scheduling hack is x86
> > oriented. I don't think re-ordering could ever de-optimize things.
> > The only real architectural assumption is a reasonably fast
> > 32-bit multiply which is true for both the more generic earlier version
> > and the current one.
>
> I don't entirely disagree, but I think the conservative stance here is
> to keep the scheduling hack in the sysdep that has actually been tested
> and then bring it out into generic if it has been experimentally
> verified to be a universal win for all architectures we support.  That
> is, if we find that every architecture is including the sysdep version,
> then it's time to bring it out to replace the generic version.
>
> FWIW, I'm lowering the bar for acceptance because you only need to
> verify that the scheduling hack is better for the architecture you're
> interested in, not all architectures we support :)

Made the asm statements arch specific in V9.

>
> Thanks,
> Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:48                 ` Noah Goldstein
@ 2022-05-16 20:33                   ` Alexander Monakov
  2022-05-16 21:40                     ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Alexander Monakov @ 2022-05-16 20:33 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Siddhesh Poyarekar, GNU C Library

On Mon, 16 May 2022, Noah Goldstein wrote:

> > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > decides to emit madd vs. 2 cycles if it's only additions.
> 
> AFAIK the shift-by-5 + 2x adds is the best that exists at the moment but
> the point is some arch we either arent thinking about or some future variant
> may implement a leq 2 cycle madd.

Maybe they'll manage to fit two dependent additions in a single cycle like it
was on Pentium 4, too ;)

Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 20:33                   ` Alexander Monakov
@ 2022-05-16 21:40                     ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-16 21:40 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Siddhesh Poyarekar, GNU C Library

On Mon, May 16, 2022 at 3:33 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Mon, 16 May 2022, Noah Goldstein wrote:
>
> > > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > > decides to emit madd vs. 2 cycles if it's only additions.
> >
> > AFAIK the shift-by-5 + 2x adds is the best that exists at the moment but
> > the point is some arch we either arent thinking about or some future variant
> > may implement a leq 2 cycle madd.
>
> Maybe they'll manage to fit two dependent additions in a single cycle like it
> was on Pentium 4, too ;)

Yes, but while the barriers preclude the MADD, they don't inherently
preclude 2x dependent ADDs. I think the barrier code is better and the
comments/commit message recommend that arch maintainers use it, but I
think there is a fair case that this is arch dependent.
>
> Alexander

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 19:28           ` Alexander Monakov
  2022-05-16 19:35             ` Noah Goldstein
@ 2022-05-17  1:45             ` Siddhesh Poyarekar
  1 sibling, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  1:45 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Noah Goldstein, libc-alpha

On 17/05/2022 00:58, Alexander Monakov wrote:
>> I did explain; I am not comfortable with controlling instruction scheduling in
>> that manner for generic code because it assumes more about underlying
>> processor pipelines and instruction sequences than we typically do in generic
>> code.  It has nothing to do with portability. Adhemerval raised the question
>> about whether this ought to be done in gcc instead, which I concurred with
>> too.
> 
> Thank you very much for the detailed response. Allow me to clear up what seems
> to be a technical misunderstanding here: this is not about instruction
> scheduling, but rather dependencies in the computations (I know Noah mentioned
> scheduling, but it's confusing especially in context of benchmarking for an
> out-of-order CPU).
> 
> I have shown how different variants have different chains of dependencies in
> this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html

Agreed, but again the latencies due to that dependency graph may have 
more or less impact depending on the architecture and eventually is 
linked to the code schedule, so the difference is academic IMO.

> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> the dependency graph in context of the whole loop.
> 
> There's nothing specific to the x86 architecture in this reasoning. On arm and
> aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.

That may well be true, but there are always architecture quirks to throw 
one off that may have been missed in testing or may turn up later.  Like 
I conceded before, it may well be that my concerns are unfounded and 
that gcc will generate the best code for all architectures with those 
barriers in place but that choice should be explicitly made based on 
benchmarks.

Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-05-16 20:30   ` [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-17  3:34   ` Siddhesh Poyarekar
  2022-05-18 17:28     ` Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  3:34 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 17/05/2022 01:59, Noah Goldstein via Libc-alpha wrote:
> No change to the code other than moving the function to
> dl-new-hash.h. Changed name so its now in the reserved namespace.
> ---
>   elf/dl-lookup.c   | 13 ++-----------
>   elf/dl-new-hash.h | 37 +++++++++++++++++++++++++++++++++++++
>   2 files changed, 39 insertions(+), 11 deletions(-)
>   create mode 100644 elf/dl-new-hash.h

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> index 989b073e4f..a42f6d5390 100644
> --- a/elf/dl-lookup.c
> +++ b/elf/dl-lookup.c
> @@ -24,6 +24,7 @@
>   #include <ldsodefs.h>
>   #include <dl-hash.h>
>   #include <dl-machine.h>
> +#include <dl-new-hash.h>
>   #include <dl-protected.h>
>   #include <sysdep-cancel.h>
>   #include <libc-lock.h>
> @@ -558,16 +559,6 @@ skip:
>   }
>   
>   
> -static uint32_t
> -dl_new_hash (const char *s)
> -{
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> -}
> -
> -
>   /* Add extra dependency on MAP to UNDEF_MAP.  */
>   static int
>   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
>   		     const struct r_found_version *version,
>   		     int type_class, int flags, struct link_map *skip_map)
>   {
> -  const unsigned int new_hash = dl_new_hash (undef_name);
> +  const unsigned int new_hash = _dl_new_hash (undef_name);
>     unsigned long int old_hash = 0xffffffff;
>     struct sym_val current_value = { NULL, NULL };
>     struct r_scope_elem **scope = symbol_scope;
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> new file mode 100644
> index 0000000000..b7a91ecc07
> --- /dev/null
> +++ b/elf/dl-new-hash.h
> @@ -0,0 +1,37 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
> +
> +#endif /* dl-new-hash.h */


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-16 20:30   ` [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-17  4:19     ` Siddhesh Poyarekar
  2022-05-18 17:29       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  4:19 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> If we want to further optimize the functions tests are needed.
> ---
>   elf/Makefile      |   1 +
>   elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 148 insertions(+)
>   create mode 100644 elf/tst-dl-hash.c
> 
> diff --git a/elf/Makefile b/elf/Makefile
> index fc9860edee..0e72f913a0 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -309,6 +309,7 @@ tests := \
>     tst-array4 \
>     tst-array5 \
>     tst-auxv \
> +  tst-dl-hash \
>     tst-leaks1 \
>     tst-stringtable \
>     tst-tls9 \
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> new file mode 100644
> index 0000000000..e806a274ca
> --- /dev/null
> +++ b/elf/tst-dl-hash.c
> @@ -0,0 +1,147 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +/* Simple implementation of ELF ABI hash function. */

The one line description is typically the first line at the top, just 
before the copyright notice.  And perhaps you want to call it "Test ELF 
ABI hash functions" or something similar :)

> +
> +#include <dl-hash.h>
> +#include <dl-new-hash.h>
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +typedef unsigned int (*hash_f) (const char *);
> +
> +static unsigned int
> +simple_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}

Maybe just `#define _dl_new_hash simple_dl_new_hash` and include 
elf/dl-new-hash.h here?  And then don't get rid of elf/dl-new-hash.h in 
6/6, let it remain the reference implementation to test against. 
Perhaps also add a comment in that file stating that it is a reference 
implementation to test against and that sysdeps has the actual 
implementation that gets used, depending on the target.

> +
> +static unsigned int
> +simple_dl_elf_hash (const char *name_arg)
> +{
> +  unsigned long int hash = 0;
> +  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
> +    {
> +      unsigned long int hi;
> +      hash = (hash << 4) + c;
> +      hi = hash & 0xf0000000;
> +      hash ^= hi >> 24;
> +      hash &= 0x0fffffff;
> +    }
> +  return hash;
> +}

Likewise, add elf/dl-hash.h with this reference implementation.

> +static int
> +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> +	      hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  char buf[len + 1];
> +  memset (buf, fill, len);
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    {
> +      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
> +		  res);
> +    }
> +
> +  return 0;
> +}
> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
> +		    &simple_dl_new_hash))
> +    {

Redundant braces.

> +      return 1;
> +    }
> +  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
> +		       &simple_dl_elf_hash);
> +}
> +
> +static int
> +do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len + 1];
> +  char v;
> +  for (i = 0; i < len; ++i)
> +    {
> +      v = random ();
> +      if (v == 0)
> +	{

Likewise.

> +	  v = 1;
> +	}
> +      buf[i] = v;
> +    }
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    {
> +      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
> +      return 1;
> +    }
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
> +    {

Likewise.

> +      return 1;
> +    }
> +  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +	{
> +	  if (do_rand_tests (i))
> +	    {

Likewise.

> +	      return 1;
> +	    }
> +
> +	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +	    {

Likewise.

> +	      return 1;
> +	    }
> +	}
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-16 20:30   ` [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-17  4:32     ` Siddhesh Poyarekar
  2022-05-18 17:30       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  4:32 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> If we want to further optimize the function tests are needed.
> ---
>   nss/Makefile       |   1 +
>   nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 106 insertions(+)
>   create mode 100644 nss/tst-nss-hash.c
> 
> diff --git a/nss/Makefile b/nss/Makefile
> index d8b06b44fb..a978e3927a 100644
> --- a/nss/Makefile
> +++ b/nss/Makefile
> @@ -62,6 +62,7 @@ tests := \
>     test-digits-dots \
>     test-netdb \
>     tst-nss-getpwent \
> +  tst-nss-hash \
>     tst-nss-test1 \
>     tst-nss-test2 \
>     tst-nss-test4 \
> diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
> new file mode 100644
> index 0000000000..6bb2ce06ab
> --- /dev/null
> +++ b/nss/tst-nss-hash.c
> @@ -0,0 +1,105 @@
> +/* Test __nss_hash
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.

New file, so only 2022?

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +#include <nss.h>
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +/* Simplest implementation of __nss_hash. */
> +static uint32_t
> +simple_nss_hash (const void *keyarg, size_t len)
> +{
> +  const unsigned char *key;
> +  size_t i;
> +  uint32_t h = 0;
> +  key = keyarg;
> +
> +  for (i = 0; i < len; ++i)
> +    {
> +      h = *key++ + 65599 * h;
> +    }
> +  return h;
> +}

Same as dl-hash, it may make sense to maintain this in nss/nss_hash.c:

#ifdef __nss_hash
/* Describe the reference macro */
static uint32_t
__nss_hash (const void *keyarg, size_t len)
{
   const unsigned char *key;
   size_t i;
   uint32_t h = 0;
   key = keyarg;

   for (i = 0; i < len; ++i)
     {
       h = *key++ + 65599 * h;
     }
   return h;
}
#else
static uint32_t
__nss_hash (const void *keyarg, size_t len)
{
   ... the current implementation...
}
#endif

It can then be included like so:

#define __nss_hash simple_nss_hash
#include "nss_hash.c"

> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  uint32_t expec, res;
> +  char buf[len];
> +  memset (buf, fill, len);
> +
> +  expec = simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    {

Redundant braces.

> +      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> +    }
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len];
> +  for (i = 0; i < len; ++i)
> +    {

Redundant braces.

> +      buf[i] = random ();
> +    }
> +
> +  expec = simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    {
> +      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
> +      return 1;
> +    }
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +	{
> +	  if (do_rand_tests (i))
> +	    {

Redundant braces.

> +	      return 1;
> +	    }
> +	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +	    {

Redundant braces.

> +	      return 1;
> +	    }
> +	}
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-16 20:30   ` [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-17  4:52     ` Siddhesh Poyarekar
  2022-05-18 17:33       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  4:52 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> Benchtests are for throughput and include random / fixed size
> benchmarks.
> ---
>   benchtests/Makefile            |  25 ++++-
>   benchtests/README              |   9 +-
>   benchtests/bench-dl-elf-hash.c |  23 ++++
>   benchtests/bench-dl-new-hash.c |  23 ++++
>   benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
>   benchtests/bench-nss-hash.c    |  24 ++++
>   6 files changed, 292 insertions(+), 8 deletions(-)
>   create mode 100644 benchtests/bench-dl-elf-hash.c
>   create mode 100644 benchtests/bench-dl-new-hash.c
>   create mode 100644 benchtests/bench-hash-funcs.c
>   create mode 100644 benchtests/bench-nss-hash.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index de9de5cf58..c279041e19 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -227,6 +227,12 @@ LOCALES := \
>   include ../gen-locales.mk
>   endif
>   
> +hash-benchset := \
> +  dl-elf-hash \
> +  dl-new-hash \
> +  nss-hash \
> +# hash-benchset
> +
>   stdlib-benchset := strtod
>   
>   stdio-common-benchset := sprintf
> @@ -235,7 +241,7 @@ math-benchset := math-inlines
>   
>   ifeq (${BENCHSET},)
>   benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> -	    $(math-benchset)
> +	    $(math-benchset) $(hash-benchset)
>   else
>   benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
>   endif
> @@ -363,9 +369,20 @@ bench-clean:
>   
>   # Validate the passed in BENCHSET
>   ifneq ($(strip ${BENCHSET}),)
> -VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
> -   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
> -   malloc-thread malloc-simple
> +VALIDBENCHSETNAMES := \
> +  bench-math \
> +  bench-pthread \
> +  bench-string \
> +  hash-benchset \
> +  malloc-simple \
> +  malloc-thread \
> +  math-benchset \
> +  stdio-common-benchset \
> +  stdlib-benchset \
> +  string-benchset \
> +  wcsmbs-benchset \
> +# VALIDBENCHSETNAMES
> +
>   INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
>   ifneq (${INVALIDBENCHSETNAMES},)
>   $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})

OK.

> diff --git a/benchtests/README b/benchtests/README
> index 4d83a05b4b..998ba9b2b4 100644
> --- a/benchtests/README
> +++ b/benchtests/README
> @@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
>       bench-math
>       bench-pthread
>       bench-string
> +    hash-benchset
> +    malloc-thread
> +    math-benchset
> +    stdio-common-benchset
> +    stdlib-benchset
>       string-benchset
>       wcsmbs-benchset
> -    stdlib-benchset
> -    stdio-common-benchset
> -    math-benchset
> -    malloc-thread
>   

OK.

>   Adding a function to benchtests:
>   ===============================
> diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
> new file mode 100644
> index 0000000000..5ca5116ad3
> --- /dev/null
> +++ b/benchtests/bench-dl-elf-hash.c
> @@ -0,0 +1,23 @@
> +/* Measure _dl_elf_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <dl-hash.h>
> +#define TEST_FUNC(x, y) _dl_elf_hash (x)
> +#define TEST_NAME "_dl_elf_hash"
> +
> +#include "bench-hash-funcs.c"

Reusing infrastructure.  OK.

> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> new file mode 100644
> index 0000000000..f5be528960
> --- /dev/null
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -0,0 +1,23 @@
> +/* Measure __dl_new_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <elf/dl-new-hash.h>
> +#define TEST_FUNC(x, y) _dl_new_hash (x)
> +#define TEST_NAME "_dl_new_hash"
> +
> +#include "bench-hash-funcs.c"

Same.  OK.

> diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
> new file mode 100644
> index 0000000000..85cf7de8bc
> --- /dev/null
> +++ b/benchtests/bench-hash-funcs.c
> @@ -0,0 +1,196 @@
> +/* Measure hash functions runtime.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#ifndef TEST_FUNC
> +# error "No TEST_FUNC provided!"
> +#endif
> +
> +#ifndef TEST_NAME
> +# define STRINGIFY_PRIMITIVE(x) #  x
> +# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
> +
> +# define TEST_NAME STRINGIFY (TEST_FUNC)
> +#endif
> +
> +#include "json-lib.h"
> +#include "bench-timing.h"
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
> +
> +enum
> +{
> +  NFIXED_ITERS = 1048576,
> +  NRAND_BUFS = 16384,
> +  NRAND_ITERS = 2048,
> +  RAND_BENCH_MAX_LEN = 256
> +};
> +
> +static double __attribute__ ((noinline, noclone))
> +do_one_test_kernel (const char *s, size_t len)
> +{
> +
> +  unsigned int iters;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (iters = NFIXED_ITERS / 32; iters; --iters)
> +    {
> +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
> +    }
> +
> +  TIMING_NOW (start);
> +  for (iters = NFIXED_ITERS; iters; --iters)
> +    {
> +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (len);
> +  return (double) cur / (double) NFIXED_ITERS;
> +}
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, size_t len)
> +{
> +  char buf[len + 1];
> +  memset (buf, -1, len);
> +  buf[len] = '\0';
> +
> +  json_element_object_begin (json_ctx);
> +
> +  json_attr_string (json_ctx, "type", "fixed");
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
> +
> +  json_element_object_end (json_ctx);
> +}
> +static double
> +do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
> +{
> +  unsigned int i, iters;
> +  size_t offset;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
> +    {
> +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
> +    }
> +
> +  TIMING_NOW (start);
> +  for (iters = NRAND_ITERS; iters; --iters)
> +    {
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	{
> +	  DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
> +	}
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (sizes);
> +  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
> +}
> +
> +static void __attribute__ ((noinline, noclone))
> +do_rand_test (json_ctx_t *json_ctx)
> +{
> +  size_t i, sz, offset;
> +  char *bufs;
> +  unsigned int *sizes;
> +
> +  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
> +  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
> +  if (bufs == NULL || sizes == NULL)
> +    {
> +      fprintf (stderr, "Failed to allocate bufs for random test\n");
> +      goto done;
> +    }
> +
> +  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_string (json_ctx, "type", "random");
> +      json_attr_uint (json_ctx, "length", sz);
> +
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	{
> +	  sizes[i] = random () % sz;
> +	  memset (bufs + offset, -1, sizes[i]);
> +	  bufs[offset + sizes[i]] = '\0';
> +	}
> +
> +      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
> +      json_element_object_end (json_ctx);
> +    }
> +
> +done:
> +  if (bufs)
> +    {
> +      free (bufs);
> +    }
> +  if (sizes)
> +    {
> +      free (sizes);
> +    }
> +}
> +
> +static int
> +do_test (void)
> +{
> +  int i;
> +  json_ctx_t json_ctx;
> +
> +  json_init (&json_ctx, 0, stdout);
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_array_begin (&json_ctx, "results");
> +
> +  for (i = 0; i < 16; ++i)
> +    {
> +      do_one_test (&json_ctx, i);
> +    }
> +
> +  for (i = 16; i <= 256; i += i)
> +    {
> +      do_one_test (&json_ctx, i);
> +    }
> +
> +  do_rand_test (&json_ctx);
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return 0;
> +}

Please remove all redundant braces.  The benchmark looks OK, but 
how about also benchmarking the reference implementation in 
elf/dl-hash.h and elf/dl-new-hash.h so that we always have a comparison 
point, similar to the string benchmarks?

> +
> +#include <support/test-driver.c>
> diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
> new file mode 100644
> index 0000000000..085e1f8ee2
> --- /dev/null
> +++ b/benchtests/bench-nss-hash.c
> @@ -0,0 +1,24 @@
> +/* Measure __nss_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nss.h>
> +#define TEST_FUNC __nss_hash
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +#include "bench-hash-funcs.c"

Reusing infrastructure.  OK.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-16 20:30   ` [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-17  5:11     ` Siddhesh Poyarekar
  2022-05-18 17:34       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  5:11 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> The prior unrolling didn't really do much as it left the dependency
> chain between iterations. Unrolled the loop for 4 so 4x multiplies
> could be pipelined in out-of-order machines.
> 
> Results for __nss_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=25 runs
> Geometric of all benchmark New / Old: 0.845
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    4.019,    3.729,               1.078
>   fixed,      1,     4.95,    5.707,               0.867
>   fixed,      2,    5.152,    5.657,               0.911
>   fixed,      3,    4.641,    5.721,               0.811
>   fixed,      4,    5.551,     5.81,               0.955
>   fixed,      5,    6.525,    6.552,               0.996
>   fixed,      6,    6.711,    6.561,               1.023
>   fixed,      7,    6.715,    6.767,               0.992
>   fixed,      8,    7.874,    7.915,               0.995
>   fixed,      9,    8.888,    9.767,                0.91
>   fixed,     10,    8.959,    9.762,               0.918
>   fixed,     11,    9.188,    9.987,                0.92
>   fixed,     12,    9.708,   10.618,               0.914
>   fixed,     13,   10.393,    11.14,               0.933
>   fixed,     14,   10.628,   12.097,               0.879
>   fixed,     15,   10.982,   12.965,               0.847
>   fixed,     16,   11.851,   14.429,               0.821
>   fixed,     32,   24.334,   34.414,               0.707
>   fixed,     64,   55.618,   86.688,               0.642
>   fixed,    128,  118.261,   224.36,               0.527
>   fixed,    256,  256.183,  538.629,               0.476
> random,      2,   11.194,   11.556,               0.969
> random,      4,   17.516,   17.205,               1.018
> random,      8,   23.501,   20.985,                1.12
> random,     16,   28.131,   29.212,               0.963
> random,     32,   35.436,   38.662,               0.917
> random,     64,    45.74,   58.868,               0.777
> random,    128,   75.394,  121.963,               0.618
> random,    256,  139.524,  260.726,               0.535
> ---
>   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
>   1 file changed, 42 insertions(+), 37 deletions(-)
> 
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 27a348ea9b..c6a375f386 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -19,58 +19,63 @@
>   
>   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
>   /*
> - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
>    * units.  On the first time through the loop we get the "leftover bytes"
> - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> - * this routine is heavily used enough, it's worth the ugly coding.
> + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> + * HASHC. Further unrolling does not appear to help.
>    *
>    * OZ's original sdbm hash
>    */
>   uint32_t
>   __nss_hash (const void *keyarg, size_t len)
>   {
> +  enum
> +  {
> +    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
> +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> +  };
> +
>     const unsigned char *key;
> -  size_t loop;
>     uint32_t h;
>   
> -#define HASHC   h = *key++ + 65599 * h
> +#define HASHC	h = *key++ + HASH_CONST_P1 * h
>   
>     h = 0;
>     key = keyarg;
>     if (len > 0)
>       {
> -      loop = (len + 8 - 1) >> 3;
> -      switch (len & (8 - 1))
> -        {
> -        case 0:
> -          do
> -            {
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 7:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 6:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 5:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 4:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 3:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 2:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 1:
> -              HASHC;
> -            }
> -	  while (--loop);
> -        }
> +      switch ((len & (4 - 1)))
> +	{
> +	case 0:
> +	  /* h starts out as zero so no need to include the multiply. */
> +	  h = *key++;
> +	  /* FALLTHROUGH */
> +	case 3:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 2:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 1:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	}

The first 4 bytes, also sufficient for len <= 4.  OK.

> +
> +      uint32_t c0, c1, c2, c3;
> +      for (--len; len >= 4; len -= 4)
> +	{
> +	  c0 = (unsigned char) *(key + 0);
> +	  c1 = (unsigned char) *(key + 1);
> +	  c2 = (unsigned char) *(key + 2);
> +	  c3 = (unsigned char) *(key + 3);
> +	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> +	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> +
> +	  key += 4;
> +	}

Remaining larger lengths.  OK.

>       }
>     return h;
>   }

TBH this wins solely on the front of the code being easier to 
understand.  The fact that it is also faster in some cases is a bonus :)

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-16 20:30   ` [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-17  5:12     ` Siddhesh Poyarekar
  2022-05-18 17:38       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-17  5:12 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha; +Cc: Alexander Monakov

Not sure why, but the series failed to apply on trybot.  It's probably a 
trybot bug because it applied just fine on my up to date copy.

On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
> 
> As well, as an optional sysdep, reorder the operations and prevent
> reassosiation for better scheduling and higher ILP. This commit

reassociation.  Later in the patch too.

> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
> 
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   sysdeps/generic/dl-new-hash.h      | 114 +++++++++++++++++++++++++++++
>   {elf => sysdeps/x86}/dl-new-hash.h |  16 +---

This breaks the benchmark build, but including just dl-new-hash.h in the 
benchmark should fix it.

>   2 files changed, 117 insertions(+), 13 deletions(-)
>   create mode 100644 sysdeps/generic/dl-new-hash.h
>   rename {elf => sysdeps/x86}/dl-new-hash.h (77%)
> 
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..84aa7991a4
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,114 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>

Same header included twice.

> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the
> +   loop to pipeline the multiples.

.. the multiples, which gcc cannot easily do due to dependencies across 
iterations.

> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specifying order of operations to prevent

to explicitly specify the order...

> +   reassosiation of instructions that lengthens the loop carried
> +   dependency. This may have no affect as the compiler may have
> +   ordered instructions the same way without it but in testing this
> +   has not been the case for GCC. Improving GCC to reliably schedule
> +   instructions ideally cannot be easily done.
> +
> +   Architecture(s) that use the reassosiation barries are:
> +   x86
> +
> +   Note it is very unlikely the reassosiation barriers would
> +   de-optimize performance on any archictecture and with an imperfect

architecture

> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.  */

Suggest: "architecture maintainers are encouraged to benchmark this with 
__asm_reassociation_barrier defined to __asm__ like it is in x86."

> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +	 first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  /* Ideal instruction scheduling is:

Suggest: "Ideal computation order is"

> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;
> +
> +	 The __asm_reassociation_barrier() macro is a sysdep optional asm
> +	 statements to prevents reassosiation that would result in more
> +	 instruction interdependencies and worse scheduling.  */

This bit is redundant with the description at the top near 
__asm_reassociation_barrier.

> +	  c0 += h;
> +	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal instruction scheduling is:

Same: "Ideal computation order is"

> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;
> +
> +	 The __asm_reassociation_barrier() macro is a sysdep optional asm
> +	 statements to prevents reassosiation that would result in more
> +	 instruction interdependencies and worse scheduling.  */

This too is redundant.

> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +
> +#endif /* dl-new-hash.h */
> diff --git a/elf/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> similarity index 77%
> rename from elf/dl-new-hash.h
> rename to sysdeps/x86/dl-new-hash.h
> index b7a91ecc07..dd800265bf 100644
> --- a/elf/dl-new-hash.h
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -19,19 +19,9 @@
>   #ifndef _DL_NEW_HASH_H
>   #define _DL_NEW_HASH_H 1

No need to define this...

>   
> -#include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
> -
> -static __always_inline uint32_t
> -__attribute__ ((unused))
> -_dl_new_hash (const char *s)
> -{
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> -}
> +#define __asm_reassociation_barrier __asm__
>   
> +#undef _DL_NEW_HASH_H

... if it is unconditionally undefined here.

> +#include <sysdeps/generic/dl-new-hash.h>
>   
>   #endif /* dl-new-hash.h */


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (15 preceding siblings ...)
  2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
@ 2022-05-18 17:26 ` Noah Goldstein
  2022-05-18 17:26   ` [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (5 more replies)
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
  17 siblings, 6 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..8641bb4196
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,40 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+/* For testing/benchmarking purposes.  */
+#define __simple_dl_new_hash _dl_new_hash
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
@ 2022-05-18 17:26   ` Noah Goldstein
  2022-05-19 14:49     ` Siddhesh Poyarekar
  2022-05-18 17:26   ` [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the functions, tests are needed.
---
 elf/Makefile         |   1 +
 elf/simple-dl-hash.h |  42 ++++++++++++++++
 elf/tst-dl-hash.c    | 115 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+)
 create mode 100644 elf/simple-dl-hash.h
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index ce3345ed92..adf1bcf6ce 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -312,6 +312,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/simple-dl-hash.h b/elf/simple-dl-hash.h
new file mode 100644
index 0000000000..53702b3c55
--- /dev/null
+++ b/elf/simple-dl-hash.h
@@ -0,0 +1,42 @@
+/* __simple_dl_elf_hash for testing true elf symbol lookup.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SIMPLE_DL_ELF_HASH_H
+#define _SIMPLE_DL_ELF_HASH_H 1
+
+#include <stdint.h>
+
+/* For testing/benchmarking purposes.  Real implementation in
+   sysdeps/generic/dl-hash.h.  */
+static uint32_t
+__attribute__ ((unused))
+__simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+#endif /* simple-dl-hash.h */
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..8697eb73a0
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,115 @@
+/* Test dl-hash functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <simple-dl-hash.h>
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		res);
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &__simple_dl_new_hash))
+    return 1;
+
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &__simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	v = 1;
+
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &__simple_dl_new_hash))
+    return 1;
+
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &__simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    return 1;
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    return 1;
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
  2022-05-18 17:26   ` [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-18 17:26   ` Noah Goldstein
  2022-05-19 15:09     ` Siddhesh Poyarekar
  2022-05-18 17:26   ` [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile          |  1 +
 nss/nss_hash.c        | 16 +++++++++
 nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
 nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+)
 create mode 100644 nss/simple-nss-hash.h
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..f9e17d068a 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -75,4 +75,20 @@ __nss_hash (const void *keyarg, size_t len)
   return h;
 }
 
+/* For testing/benchmarking purposes. */
+static uint32_t
+__simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    h = *key++ + 65599 * h;
+
+  return h;
+}
+
+
 libc_hidden_def (__nss_hash)
diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
new file mode 100644
index 0000000000..47708972e7
--- /dev/null
+++ b/nss/simple-nss-hash.h
@@ -0,0 +1,42 @@
+/* __simple_nss_hash for testing nss_hash function
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SIMPLE_NSS_HASH_H
+#define _SIMPLE_NSS_HASH_H 1
+
+#include <stdint.h>
+
+/* For testing/benchmarking purposes.  Real implementation in
+   nss/nss_hash.c.  */
+static uint32_t
+__attribute__ ((unused))
+__simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    h = *key++ + 65599 * h;
+
+  return h;
+}
+
+
+#endif /* simple-nss-hash.h */
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..5ec1f9b0c5
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,80 @@
+/* Test __nss_hash
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+#include <simple-nss-hash.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = __simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    buf[i] = random ();
+
+  expec = __simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    return 1;
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    return 1;
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
  2022-05-18 17:26   ` [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-18 17:26   ` [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-18 17:26   ` Noah Goldstein
  2022-05-19 15:34     ` Siddhesh Poyarekar
  2022-05-18 17:26   ` [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile                  |  25 ++++-
 benchtests/README                    |   9 +-
 benchtests/bench-dl-elf-hash.c       |  27 +++++
 benchtests/bench-dl-new-hash.c       |  25 +++++
 benchtests/bench-hash-funcs-kernel.h |  92 ++++++++++++++++
 benchtests/bench-hash-funcs.c        | 152 +++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c          |  26 +++++
 7 files changed, 348 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs-kernel.h
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..067de9fca4
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,27 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#include <elf/simple-dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define SIMPLE_TEST_FUNC(x, y) __simple_dl_elf_hash (x)
+
+#define TEST_NAME "_dl_elf_hash"
+
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..3c8a1d5a82
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,25 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
+
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs-kernel.h b/benchtests/bench-hash-funcs-kernel.h
new file mode 100644
index 0000000000..9f9f245641
--- /dev/null
+++ b/benchtests/bench-hash-funcs-kernel.h
@@ -0,0 +1,92 @@
+/* Actual benchmark kernels used by bench-hash-funcs.c
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+
+/* We go through the trouble of using macros here because many of the
+   hash functions are meant to be inlined so it's not fair to benchmark
+   them with a function pointer where they won't be inlinable. */
+#undef RUN_FUNC
+#undef POSTFIX
+#ifdef SIMPLE
+# define RUN_FUNC SIMPLE_TEST_FUNC
+# define POSTFIX _simple
+#else
+# define RUN_FUNC TEST_FUNC
+# define POSTFIX _optimized
+#endif
+
+#define PRIMITIVE_CAT(x, y) x ## y
+#define CAT(x, y) PRIMITIVE_CAT (x, y)
+
+static double __attribute__ ((noinline, noclone))
+CAT (do_one_test_kernel, POSTFIX) (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    {
+      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static double __attribute__ ((noinline, noclone))
+CAT (do_rand_test_kernel, POSTFIX) (char const *bufs,
+				    unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    {
+      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
+    }
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
+	}
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..3d3c736ffc
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,152 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+#ifndef SIMPLE_TEST_FUNC
+# error "No SIMPLE_TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 128
+};
+
+#include "bench-hash-funcs-kernel.h"
+#define SIMPLE
+#include "bench-hash-funcs-kernel.h"
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time_simple", do_one_test_kernel_simple (buf, len));
+  json_attr_double (json_ctx, "time_optimized", do_one_test_kernel_optimized (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time_simple",
+			do_rand_test_kernel_simple (bufs, sizes));
+      json_attr_double (json_ctx, "time_optimized",
+			do_rand_test_kernel_optimized (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    {
+      free (bufs);
+    }
+  if (sizes)
+    {
+      free (sizes);
+    }
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  for (i = 16; i <= 256; i += i)
+    {
+      do_one_test (&json_ctx, i);
+    }
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..7e369428a2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,26 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#include <nss/simple-nss-hash.h>
+#define TEST_FUNC __nss_hash
+#define SIMPLE_TEST_FUNC __simple_nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-18 17:26   ` [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-18 17:26   ` Noah Goldstein
  2022-05-19 15:41     ` Siddhesh Poyarekar
  2022-05-18 17:26   ` [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-19 14:47   ` [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations.  Unroll the loop by 4 so that four multiplies
can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks, New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 92 ++++++++++++++++++++++----------------------------
 1 file changed, 41 insertions(+), 51 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index f9e17d068a..1d3787e675 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,74 +19,64 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
-    }
-  return h;
-}
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
 
-/* For testing/benchmarking purposes. */
-static uint32_t
-__simple_nss_hash (const void *keyarg, size_t len)
-{
-  const unsigned char *key;
-  size_t i;
-  uint32_t h = 0;
-  key = keyarg;
-
-  for (i = 0; i < len; ++i)
-    h = *key++ + 65599 * h;
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
 
+	  key += 4;
+	}
+    }
   return h;
 }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-18 17:26   ` [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-18 17:26   ` Noah Goldstein
  2022-05-18 17:32     ` H.J. Lu
  2022-05-19 15:55     ` Siddhesh Poyarekar
  2022-05-19 14:47   ` [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
  5 siblings, 2 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:26 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. The unrolling allows for
pipelined multiplies.

As well, as an optional sysdep, reorder the operations and prevent
reassociation for better scheduling and higher ILP.  This commit
only adds the barrier for x86, although it should be either no
change or a win for any architecture.

Unrolling further started to induce slowdowns for sizes [0, 4],
but it can help the loop, so if larger sizes are the target, further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean of all benchmarks, New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 benchtests/bench-dl-new-hash.c              |   3 +-
 elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
 elf/tst-dl-hash.c                           |   1 +
 sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
 sysdeps/x86/dl-new-hash.h                   |  24 +++++
 5 files changed, 146 insertions(+), 13 deletions(-)
 rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
 create mode 100644 sysdeps/generic/dl-new-hash.h
 create mode 100644 sysdeps/x86/dl-new-hash.h

diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
index 3c8a1d5a82..040fa7ce01 100644
--- a/benchtests/bench-dl-new-hash.c
+++ b/benchtests/bench-dl-new-hash.c
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <elf/dl-new-hash.h>
+#include <dl-new-hash.h>
+#include <elf/simple-dl-new-hash.h>
 #define TEST_FUNC(x, y) _dl_new_hash (x)
 #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
 
diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
similarity index 75%
rename from elf/dl-new-hash.h
rename to elf/simple-dl-new-hash.h
index 8641bb4196..1437b1bd36 100644
--- a/elf/dl-new-hash.h
+++ b/elf/simple-dl-new-hash.h
@@ -1,4 +1,4 @@
-/* _dl_new_hash for elf symbol lookup
+/* __simple_dl_new_hash for testing true elf symbol lookup.
    Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,16 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#ifndef _DL_NEW_HASH_H
-#define _DL_NEW_HASH_H 1
+#ifndef _SIMPLE_DL_NEW_HASH_H
+#define _SIMPLE_DL_NEW_HASH_H 1
 
 #include <stdint.h>
-/* For __always_inline.  */
-#include <sys/cdefs.h>
 
-static __always_inline uint32_t
+/* For testing/benchmarking purposes.  Real implementation in
+   sysdeps/generic/dl-new-hash.h.  */
+static uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+__simple_dl_new_hash (const char *s)
 {
   uint32_t h = 5381;
   for (unsigned char c = *s; c != '\0'; c = *++s)
@@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
   return h;
 }
 
-/* For testing/benchmarking purposes.  */
-#define __simple_dl_new_hash _dl_new_hash
-
-
-#endif /* dl-new-hash.h */
+#endif /* simple-dl-new-hash.h */
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
index 8697eb73a0..b21766c63d 100644
--- a/elf/tst-dl-hash.c
+++ b/elf/tst-dl-hash.c
@@ -18,6 +18,7 @@
 
 
 #include <simple-dl-hash.h>
+#include <simple-dl-new-hash.h>
 #include <dl-hash.h>
 #include <dl-new-hash.h>
 #include <support/support.h>
diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
new file mode 100644
index 0000000000..1faf309c97
--- /dev/null
+++ b/sysdeps/generic/dl-new-hash.h
@@ -0,0 +1,111 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+   _dl_new_hash (const char *s)
+   {
+      uint32_t h = 5381;
+      for (unsigned char c = *s; c != '\0'; c = *++s)
+        h = h * 33 + c;
+      return h;
+   }
+
+   We can get better performance by slightly unrolling the loop to
+   pipeline the multiplies, which gcc cannot easily do due to
+   dependencies across iterations.
+
+   As well, as an architecture specific option we add asm statements
+   to explicitly specify order of operations and prevent reassociation
+   of instructions that lengthens the loop carried dependency.  This
+   may have no effect, as the compiler may have ordered instructions
+   the same way without it, but in testing this has not been the case
+   for GCC.  Improving GCC so that it reliably produces the ideal
+   instruction schedule cannot be easily done.
+
+   Architecture(s) that use the reassociation barriers are:
+   x86
+
+   Note it is very unlikely the reassociation barriers would
+   de-optimize performance on any architecture and with an imperfect
+   compiler it may help performance, especially on out-of-order cpus,
+   so it is suggested that the respective maintainers add them.
+
+   Architecture maintainers are encouraged to benchmark this with
+   __asm_reassociation_barrier defined to __asm__ like it is in x86.
+*/
+
+
+#ifndef __asm_reassociation_barrier
+# define __asm_reassociation_barrier(...)
+#endif
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *str)
+{
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Since hashed string is normally not empty, this is unlikely on the
+	 first iteration of the loop.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  /* Ideal computational order is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;  */
+	  c0 += h;
+	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal computational order is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;  */
+      c1 += c0;
+      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      __asm_reassociation_barrier("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
+}
+
+#endif /* dl-new-hash.h */
diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
new file mode 100644
index 0000000000..ce8fb5a838
--- /dev/null
+++ b/sysdeps/x86/dl-new-hash.h
@@ -0,0 +1,24 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __asm_reassociation_barrier
+# error "__asm_reassociation_barrier should never already be defined."
+#endif
+
+#define __asm_reassociation_barrier __asm__
+#include <sysdeps/generic/dl-new-hash.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-17  3:34   ` [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
@ 2022-05-18 17:28     ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:28 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Mon, May 16, 2022 at 10:34 PM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 17/05/2022 01:59, Noah Goldstein via Libc-alpha wrote:
> > No change to the code other than moving the function to
> > dl-new-hash.h. Changed name so its now in the reserved namespace.
> > ---
> >   elf/dl-lookup.c   | 13 ++-----------
> >   elf/dl-new-hash.h | 37 +++++++++++++++++++++++++++++++++++++
> >   2 files changed, 39 insertions(+), 11 deletions(-)
> >   create mode 100644 elf/dl-new-hash.h
>
> LGTM.
>
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
>

Slightly modified in V10 adding the #define __simple_....
> > diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> > index 989b073e4f..a42f6d5390 100644
> > --- a/elf/dl-lookup.c
> > +++ b/elf/dl-lookup.c
> > @@ -24,6 +24,7 @@
> >   #include <ldsodefs.h>
> >   #include <dl-hash.h>
> >   #include <dl-machine.h>
> > +#include <dl-new-hash.h>
> >   #include <dl-protected.h>
> >   #include <sysdep-cancel.h>
> >   #include <libc-lock.h>
> > @@ -558,16 +559,6 @@ skip:
> >   }
> >
> >
> > -static uint32_t
> > -dl_new_hash (const char *s)
> > -{
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > -}
> > -
> > -
> >   /* Add extra dependency on MAP to UNDEF_MAP.  */
> >   static int
> >   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> > @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
> >                    const struct r_found_version *version,
> >                    int type_class, int flags, struct link_map *skip_map)
> >   {
> > -  const unsigned int new_hash = dl_new_hash (undef_name);
> > +  const unsigned int new_hash = _dl_new_hash (undef_name);
> >     unsigned long int old_hash = 0xffffffff;
> >     struct sym_val current_value = { NULL, NULL };
> >     struct r_scope_elem **scope = symbol_scope;
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..b7a91ecc07
> > --- /dev/null
> > +++ b/elf/dl-new-hash.h
> > @@ -0,0 +1,37 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> > +
> > +#endif /* dl-new-hash.h */
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-17  4:19     ` Siddhesh Poyarekar
@ 2022-05-18 17:29       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:29 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Mon, May 16, 2022 at 11:19 PM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > If we want to further optimize the functions tests are needed.
> > ---
> >   elf/Makefile      |   1 +
> >   elf/tst-dl-hash.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 148 insertions(+)
> >   create mode 100644 elf/tst-dl-hash.c
> >
> > diff --git a/elf/Makefile b/elf/Makefile
> > index fc9860edee..0e72f913a0 100644
> > --- a/elf/Makefile
> > +++ b/elf/Makefile
> > @@ -309,6 +309,7 @@ tests := \
> >     tst-array4 \
> >     tst-array5 \
> >     tst-auxv \
> > +  tst-dl-hash \
> >     tst-leaks1 \
> >     tst-stringtable \
> >     tst-tls9 \
> > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> > new file mode 100644
> > index 0000000000..e806a274ca
> > --- /dev/null
> > +++ b/elf/tst-dl-hash.c
> > @@ -0,0 +1,147 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +/* Simple implementation of ELF ABI hash function. */
>
> The one line description is typically the first line at the top, just
> before the copyright notice.  And perhaps you want to call it "Test ELF
> ABI hash functions" or something similar :)

Fixed in V10.
>
> > +
> > +#include <dl-hash.h>
> > +#include <dl-new-hash.h>
> > +#include <support/support.h>
> > +#include <support/check.h>
> > +#include <stdio.h>
> > +#include <string.h>
> > +#include <stdlib.h>
> > +
> > +typedef unsigned int (*hash_f) (const char *);
> > +
> > +static unsigned int
> > +simple_dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
>
> Maybe just `#define dl_new_hash simple_dl_new_hash` and include
> elf/dl-new-hash.h here?  And then don't get rid of elf/dl-new-hash.h in
> 6/6, let it remain the reference implementation to test against.
> Perhaps also add a comment in that file stating that it is a reference
> implementation to test against and that sysdeps has the actual
> implementation that gets used, depending on the target.

Done in V10.
>
> > +
> > +static unsigned int
> > +simple_dl_elf_hash (const char *name_arg)
> > +{
> > +  unsigned long int hash = 0;
> > +  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
> > +    {
> > +      unsigned long int hi;
> > +      hash = (hash << 4) + c;
> > +      hi = hash & 0xf0000000;
> > +      hash ^= hi >> 24;
> > +      hash &= 0x0fffffff;
> > +    }
> > +  return hash;
> > +}
>
> Likewise, add elf/dl-hash.h with this reference implementation.

Done in V10.
>
> > +static int
> > +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> > +           hash_f expecf)
> > +{
> > +  uint32_t expec, res;
> > +  char buf[len + 1];
> > +  memset (buf, fill, len);
> > +  buf[len] = '\0';
> > +
> > +  expec = expecf (buf);
> > +  res = testf (buf);
> > +  if (expec != res)
> > +    {
> > +      FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
> > +               res);
> > +    }
> > +
> > +  return 0;
> > +}
> > +
> > +static int
> > +do_fill_tests (size_t len, int fill)
> > +{
> > +  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
> > +                 &simple_dl_new_hash))
> > +    {
>
> Redundant paranthesis.
Fixed in V10.
>
> > +      return 1;
> > +    }
> > +  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
> > +                    &simple_dl_elf_hash);
> > +}
> > +
> > +static int
> > +do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
> > +{
> > +  uint32_t expec, res;
> > +  size_t i;
> > +  char buf[len + 1];
> > +  char v;
> > +  for (i = 0; i < len; ++i)
> > +    {
> > +      v = random ();
> > +      if (v == 0)
> > +     {
>
> Likewise.
Fixed in V10.
>
> > +       v = 1;
> > +     }
> > +      buf[i] = v;
> > +    }
> > +  buf[len] = '\0';
> > +
> > +  expec = expecf (buf);
> > +  res = testf (buf);
> > +  if (expec != res)
> > +    {
> > +      printf ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
> > +      return 1;
> > +    }
> > +
> > +  return 0;
> > +}
> > +
> > +static int
> > +do_rand_tests (size_t len)
> > +{
> > +  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &simple_dl_new_hash))
> > +    {
>
> Likewise.
Fixed in V10.
>
> > +      return 1;
> > +    }
> > +  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &simple_dl_elf_hash);
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  size_t i, j;
> > +  for (i = 0; i < 100; ++i)
> > +    {
> > +      for (j = 0; j < 8192; ++j)
> > +     {
> > +       if (do_rand_tests (i))
> > +         {
>
> Likewise.
Fixed in V10.
>
> > +           return 1;
> > +         }
> > +
> > +       if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> > +           || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> > +         {
>
> Likewise.
Fixed in V10.
>
> > +           return 1;
> > +         }
> > +     }
> > +    }
> > +  return 0;
> > +}
> > +
> > +#include <support/test-driver.c>
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-17  4:32     ` Siddhesh Poyarekar
@ 2022-05-18 17:30       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:30 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Mon, May 16, 2022 at 11:32 PM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > If we want to further optimize the function, tests are needed.
> > ---
> >   nss/Makefile       |   1 +
> >   nss/tst-nss-hash.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 106 insertions(+)
> >   create mode 100644 nss/tst-nss-hash.c
> >
> > diff --git a/nss/Makefile b/nss/Makefile
> > index d8b06b44fb..a978e3927a 100644
> > --- a/nss/Makefile
> > +++ b/nss/Makefile
> > @@ -62,6 +62,7 @@ tests := \
> >     test-digits-dots \
> >     test-netdb \
> >     tst-nss-getpwent \
> > +  tst-nss-hash \
> >     tst-nss-test1 \
> >     tst-nss-test2 \
> >     tst-nss-test4 \
> > diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
> > new file mode 100644
> > index 0000000000..6bb2ce06ab
> > --- /dev/null
> > +++ b/nss/tst-nss-hash.c
> > @@ -0,0 +1,105 @@
> > +/* Test __nss_hash
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>
> New file, so only 2022?

Fixed in V10.
>
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <support/support.h>
> > +#include <support/check.h>
> > +#include <stdio.h>
> > +#include <string.h>
> > +#include <stdlib.h>
> > +#include <nss.h>
> > +
> > +uint32_t __nss_hash (const void *__key, size_t __length);
> > +
> > +/* Simplest implementation of __nss_hash. */
> > +static uint32_t
> > +simple_nss_hash (const void *keyarg, size_t len)
> > +{
> > +  const unsigned char *key;
> > +  size_t i;
> > +  uint32_t h = 0;
> > +  key = keyarg;
> > +
> > +  for (i = 0; i < len; ++i)
> > +    {
> > +      h = *key++ + 65599 * h;
> > +    }
> > +  return h;
> > +}
>
> Same as dl-hash, it may make sense to maintain this in elf/nss-hash.c:

Done in V10.
>
> #ifdef __nss_hash
> /* Describe the reference macro */
> static uint32_t
> __nss_hash (const void *keyarg, size_t len)
> {
>    const unsigned char *key;
>    size_t i;
>    uint32_t h = 0;
>    key = keyarg;
>
>    for (i = 0; i < len; ++i)
>      {
>        h = *key++ + 65599 * h;
>      }
>    return h;
> }
> #else
> static uint32_t
> __nss_hash (const void *keyarg, size_t len)
> {
>    ... the current implementation...
> }
> #endif
>
> It can then be included like so:
>
> #define __nss_hash simple_nss_hash
> #include "nss_hash.c"
>
> > +
> > +static int
> > +do_fill_tests (size_t len, int fill)
> > +{
> > +  uint32_t expec, res;
> > +  char buf[len];
> > +  memset (buf, fill, len);
> > +
> > +  expec = simple_nss_hash (buf, len);
> > +  res = __nss_hash (buf, len);
> > +  if (expec != res)
> > +    {
>
> Redundant parentheses.
Fixed in V10.
>
> > +      FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> > +    }
> > +
> > +  return 0;
> > +}
> > +
> > +static int
> > +do_rand_tests (size_t len)
> > +{
> > +  uint32_t expec, res;
> > +  size_t i;
> > +  char buf[len];
> > +  for (i = 0; i < len; ++i)
> > +    {
>
> Redundant parentheses.
Fixed in V10.
>
> > +      buf[i] = random ();
> > +    }
> > +
> > +  expec = simple_nss_hash (buf, len);
> > +  res = __nss_hash (buf, len);
> > +  if (expec != res)
> > +    {
> > +      printf ("FAIL: random (%zu), %x != %x\n", len, expec, res);
> > +      return 1;
> > +    }
> > +
> > +  return 0;
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  size_t i, j;
> > +  for (i = 0; i < 100; ++i)
> > +    {
> > +      for (j = 0; j < 8192; ++j)
> > +     {
> > +       if (do_rand_tests (i))
> > +         {
>
> Redundant parentheses.
Fixed in V10.
>
> > +           return 1;
> > +         }
> > +       if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> > +           || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> > +         {
>
> Redundant parentheses.
>
> > +           return 1;
> > +         }
> > +     }
> > +    }
> > +  return 0;
> > +}
> > +
> > +#include <support/test-driver.c>
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:26   ` [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-18 17:32     ` H.J. Lu
  2022-05-18 17:39       ` Noah Goldstein
  2022-05-19  7:53       ` Siddhesh Poyarekar
  2022-05-19 15:55     ` Siddhesh Poyarekar
  1 sibling, 2 replies; 167+ messages in thread
From: H.J. Lu @ 2022-05-18 17:32 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell, Alexander Monakov

On Wed, May 18, 2022 at 10:26 AM Noah Goldstein
<goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
>
> As well, as an optional sysdep, reorder the operations and prevent
> reassociation for better scheduling and higher ILP. This commit
> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
>
> Unrolling further started to induce slowdowns for sizes [0, 4],
> but can help the loop, so if larger sizes are the target, further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=30 runs
> Geometric mean of all benchmarks New / Old: 0.674
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    2.865,     2.72,               1.053
>  fixed,      1,    3.567,    2.489,               1.433
>  fixed,      2,    2.577,    3.649,               0.706
>  fixed,      3,    3.644,    5.983,               0.609
>  fixed,      4,    4.211,    6.833,               0.616
>  fixed,      5,    4.741,    9.372,               0.506
>  fixed,      6,    5.415,    9.561,               0.566
>  fixed,      7,    6.649,   10.789,               0.616
>  fixed,      8,    8.081,   11.808,               0.684
>  fixed,      9,    8.427,   12.935,               0.651
>  fixed,     10,    8.673,   14.134,               0.614
>  fixed,     11,    10.69,   15.408,               0.694
>  fixed,     12,   10.789,   16.982,               0.635
>  fixed,     13,   12.169,   18.411,               0.661
>  fixed,     14,   12.659,   19.914,               0.636
>  fixed,     15,   13.526,   21.541,               0.628
>  fixed,     16,   14.211,   23.088,               0.616
>  fixed,     32,   29.412,   52.722,               0.558
>  fixed,     64,    65.41,  142.351,               0.459
>  fixed,    128,  138.505,  295.625,               0.469
>  fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  benchtests/bench-dl-new-hash.c              |   3 +-
>  elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>  elf/tst-dl-hash.c                           |   1 +
>  sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>  sysdeps/x86/dl-new-hash.h                   |  24 +++++
>  5 files changed, 146 insertions(+), 13 deletions(-)
>  rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>  create mode 100644 sysdeps/generic/dl-new-hash.h
>  create mode 100644 sysdeps/x86/dl-new-hash.h
>
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> index 3c8a1d5a82..040fa7ce01 100644
> --- a/benchtests/bench-dl-new-hash.c
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -16,7 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <elf/dl-new-hash.h>
> +#include <dl-new-hash.h>
> +#include <elf/simple-dl-new-hash.h>
>  #define TEST_FUNC(x, y) _dl_new_hash (x)
>  #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>
> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> similarity index 75%
> rename from elf/dl-new-hash.h
> rename to elf/simple-dl-new-hash.h
> index 8641bb4196..1437b1bd36 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/simple-dl-new-hash.h
> @@ -1,4 +1,4 @@
> -/* _dl_new_hash for elf symbol lookup
> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>     Copyright (C) 2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -16,16 +16,16 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#ifndef _DL_NEW_HASH_H
> -#define _DL_NEW_HASH_H 1
> +#ifndef _SIMPLE_DL_NEW_HASH_H
> +#define _SIMPLE_DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
>
> -static __always_inline uint32_t
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-new-hash.h.  */
> +static uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +__simple_dl_new_hash (const char *s)
>  {
>    uint32_t h = 5381;
>    for (unsigned char c = *s; c != '\0'; c = *++s)
> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>    return h;
>  }
>
> -/* For testing/benchmarking purposes.  */
> -#define __simple_dl_new_hash _dl_new_hash
> -
> -
> -#endif /* dl-new-hash.h */
> +#endif /* simple-dl-new-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> index 8697eb73a0..b21766c63d 100644
> --- a/elf/tst-dl-hash.c
> +++ b/elf/tst-dl-hash.c
> @@ -18,6 +18,7 @@
>
>
>  #include <simple-dl-hash.h>
> +#include <simple-dl-new-hash.h>
>  #include <dl-hash.h>
>  #include <dl-new-hash.h>
>  #include <support/support.h>
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..1faf309c97
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,111 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the loop to
> +   pipeline the multiplies, which gcc cannot easily do due to
> +   dependencies across iterations.
> +
> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specify order of operations and prevent reassociation
> +   of instructions that lengthens the loop carried dependency. This
> +   may have no effect as the compiler may have ordered instructions
> +   the same way without it but in testing this has not been the case
> +   for GCC. Improving GCC to reliably schedule instructions ideally
> +   cannot be easily done.
> +
> +   Architecture(s) that use the reassociation barriers are:
> +   x86
> +
> +   Note it is very unlikely the reassociation barriers would
> +   de-optimize performance on any architecture and with an imperfect
> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.
> +
> +   Architecture maintainers are encouraged to benchmark this with
> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> +*/
> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +        first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +       {
> +         /* Ideal computational order is:
> +        c0 += h;
> +        h *= 32;
> +        h += c0;  */
> +         c0 += h;
> +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +         h = h * 32 + c0;
> +         return h;
> +       }
> +
> +      /* Ideal computational order is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;  */
> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +
> +#endif /* dl-new-hash.h */
> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> new file mode 100644
> index 0000000000..ce8fb5a838
> --- /dev/null
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -0,0 +1,24 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __asm_reassociation_barrier
> +# error "__asm_reassociation_barrier should never already be defined."
> +#endif
> +
> +#define __asm_reassociation_barrier __asm__
> +#include <sysdeps/generic/dl-new-hash.h>
> --
> 2.34.1
>

Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
and leave the generic one unchanged?

-- 
H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-17  4:52     ` Siddhesh Poyarekar
@ 2022-05-18 17:33       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:33 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Mon, May 16, 2022 at 11:52 PM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > Benchtests are for throughput and include random / fixed size
> > benchmarks.
> > ---
> >   benchtests/Makefile            |  25 ++++-
> >   benchtests/README              |   9 +-
> >   benchtests/bench-dl-elf-hash.c |  23 ++++
> >   benchtests/bench-dl-new-hash.c |  23 ++++
> >   benchtests/bench-hash-funcs.c  | 196 +++++++++++++++++++++++++++++++++
> >   benchtests/bench-nss-hash.c    |  24 ++++
> >   6 files changed, 292 insertions(+), 8 deletions(-)
> >   create mode 100644 benchtests/bench-dl-elf-hash.c
> >   create mode 100644 benchtests/bench-dl-new-hash.c
> >   create mode 100644 benchtests/bench-hash-funcs.c
> >   create mode 100644 benchtests/bench-nss-hash.c
> >
> > diff --git a/benchtests/Makefile b/benchtests/Makefile
> > index de9de5cf58..c279041e19 100644
> > --- a/benchtests/Makefile
> > +++ b/benchtests/Makefile
> > @@ -227,6 +227,12 @@ LOCALES := \
> >   include ../gen-locales.mk
> >   endif
> >
> > +hash-benchset := \
> > +  dl-elf-hash \
> > +  dl-new-hash \
> > +  nss-hash \
> > +# hash-benchset
> > +
> >   stdlib-benchset := strtod
> >
> >   stdio-common-benchset := sprintf
> > @@ -235,7 +241,7 @@ math-benchset := math-inlines
> >
> >   ifeq (${BENCHSET},)
> >   benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> > -         $(math-benchset)
> > +         $(math-benchset) $(hash-benchset)
> >   else
> >   benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
> >   endif
> > @@ -363,9 +369,20 @@ bench-clean:
> >
> >   # Validate the passed in BENCHSET
> >   ifneq ($(strip ${BENCHSET}),)
> > -VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
> > -   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
> > -   malloc-thread malloc-simple
> > +VALIDBENCHSETNAMES := \
> > +  bench-math \
> > +  bench-pthread \
> > +  bench-string \
> > +  hash-benchset \
> > +  malloc-simple \
> > +  malloc-thread \
> > +  math-benchset \
> > +  stdio-common-benchset \
> > +  stdlib-benchset \
> > +  string-benchset \
> > +  wcsmbs-benchset \
> > +# VALIDBENCHSETNAMES
> > +
> >   INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
> >   ifneq (${INVALIDBENCHSETNAMES},)
> >   $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
>
> OK.
>
> > diff --git a/benchtests/README b/benchtests/README
> > index 4d83a05b4b..998ba9b2b4 100644
> > --- a/benchtests/README
> > +++ b/benchtests/README
> > @@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
> >       bench-math
> >       bench-pthread
> >       bench-string
> > +    hash-benchset
> > +    malloc-thread
> > +    math-benchset
> > +    stdio-common-benchset
> > +    stdlib-benchset
> >       string-benchset
> >       wcsmbs-benchset
> > -    stdlib-benchset
> > -    stdio-common-benchset
> > -    math-benchset
> > -    malloc-thread
> >
>
> OK.
>
> >   Adding a function to benchtests:
> >   ===============================
> > diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
> > new file mode 100644
> > index 0000000000..5ca5116ad3
> > --- /dev/null
> > +++ b/benchtests/bench-dl-elf-hash.c
> > @@ -0,0 +1,23 @@
> > +/* Measure _dl_elf_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <dl-hash.h>
> > +#define TEST_FUNC(x, y) _dl_elf_hash (x)
> > +#define TEST_NAME "_dl_elf_hash"
> > +
> > +#include "bench-hash-funcs.c"
>
> Reusing infrastructure.  OK.
>
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > new file mode 100644
> > index 0000000000..f5be528960
> > --- /dev/null
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -0,0 +1,23 @@
> > +/* Measure _dl_new_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <elf/dl-new-hash.h>
> > +#define TEST_FUNC(x, y) _dl_new_hash (x)
> > +#define TEST_NAME "_dl_new_hash"
> > +
> > +#include "bench-hash-funcs.c"
>
> Same.  OK.
>
> > diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
> > new file mode 100644
> > index 0000000000..85cf7de8bc
> > --- /dev/null
> > +++ b/benchtests/bench-hash-funcs.c
> > @@ -0,0 +1,196 @@
> > +/* Measure hash functions runtime.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#define TEST_MAIN
> > +#ifndef TEST_FUNC
> > +# error "No TEST_FUNC provided!"
> > +#endif
> > +
> > +#ifndef TEST_NAME
> > +# define STRINGIFY_PRIMITIVE(x) #  x
> > +# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
> > +
> > +# define TEST_NAME STRINGIFY (TEST_FUNC)
> > +#endif
> > +
> > +#include "json-lib.h"
> > +#include "bench-timing.h"
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +
> > +#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
> > +
> > +enum
> > +{
> > +  NFIXED_ITERS = 1048576,
> > +  NRAND_BUFS = 16384,
> > +  NRAND_ITERS = 2048,
> > +  RAND_BENCH_MAX_LEN = 256
> > +};
> > +
> > +static double __attribute__ ((noinline, noclone))
> > +do_one_test_kernel (const char *s, size_t len)
> > +{
> > +
> > +  unsigned int iters;
> > +  timing_t start, stop, cur;
> > +
> > +  /* Warmup.  */
> > +  for (iters = NFIXED_ITERS / 32; iters; --iters)
> > +    {
> > +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
> > +    }
> > +
> > +  TIMING_NOW (start);
> > +  for (iters = NFIXED_ITERS; iters; --iters)
> > +    {
> > +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (s, len));
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  (void) (len);
> > +  return (double) cur / (double) NFIXED_ITERS;
> > +}
> > +
> > +static void
> > +do_one_test (json_ctx_t *json_ctx, size_t len)
> > +{
> > +  char buf[len + 1];
> > +  memset (buf, -1, len);
> > +  buf[len] = '\0';
> > +
> > +  json_element_object_begin (json_ctx);
> > +
> > +  json_attr_string (json_ctx, "type", "fixed");
> > +  json_attr_uint (json_ctx, "length", len);
> > +  json_attr_double (json_ctx, "time", do_one_test_kernel (buf, len));
> > +
> > +  json_element_object_end (json_ctx);
> > +}
> > +static double
> > +do_rand_test_kernel (char const *bufs, unsigned int const *sizes)
> > +{
> > +  unsigned int i, iters;
> > +  size_t offset;
> > +  timing_t start, stop, cur;
> > +
> > +  /* Warmup.  */
> > +  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
> > +    {
> > +      DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
> > +    }
> > +
> > +  TIMING_NOW (start);
> > +  for (iters = NRAND_ITERS; iters; --iters)
> > +    {
> > +      for (i = 0, offset = 0; i < NRAND_BUFS;
> > +        ++i, offset += RAND_BENCH_MAX_LEN)
> > +     {
> > +       DO_NOT_OPTIMIZE_OUT (TEST_FUNC (bufs + offset, sizes[i]));
> > +     }
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  (void) (sizes);
> > +  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
> > +}
> > +
> > +static void __attribute__ ((noinline, noclone))
> > +do_rand_test (json_ctx_t *json_ctx)
> > +{
> > +  size_t i, sz, offset;
> > +  char *bufs;
> > +  unsigned int *sizes;
> > +
> > +  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
> > +  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
> > +  if (bufs == NULL || sizes == NULL)
> > +    {
> > +      fprintf (stderr, "Failed to allocate bufs for random test\n");
> > +      goto done;
> > +    }
> > +
> > +  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
> > +    {
> > +      json_element_object_begin (json_ctx);
> > +      json_attr_string (json_ctx, "type", "random");
> > +      json_attr_uint (json_ctx, "length", sz);
> > +
> > +      for (i = 0, offset = 0; i < NRAND_BUFS;
> > +        ++i, offset += RAND_BENCH_MAX_LEN)
> > +     {
> > +       sizes[i] = random () % sz;
> > +       memset (bufs + offset, -1, sizes[i]);
> > +       bufs[offset + sizes[i]] = '\0';
> > +     }
> > +
> > +      json_attr_double (json_ctx, "time", do_rand_test_kernel (bufs, sizes));
> > +      json_element_object_end (json_ctx);
> > +    }
> > +
> > +done:
> > +  if (bufs)
> > +    {
> > +      free (bufs);
> > +    }
> > +  if (sizes)
> > +    {
> > +      free (sizes);
> > +    }
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  int i;
> > +  json_ctx_t json_ctx;
> > +
> > +  json_init (&json_ctx, 0, stdout);
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_array_begin (&json_ctx, "results");
> > +
> > +  for (i = 0; i < 16; ++i)
> > +    {
> > +      do_one_test (&json_ctx, i);
> > +    }
> > +
> > +  for (i = 16; i <= 256; i += i)
> > +    {
> > +      do_one_test (&json_ctx, i);
> > +    }
> > +
> > +  do_rand_test (&json_ctx);
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> > +  return 0;
> > +}
>
> Please remove all redundant parentheses.  The benchmark looks OK, but
> how about also benchmarking the reference implementation in
> elf/dl-hash.h and elf/dl-new-hash.h so that we always have a comparison
> point, similar to the string benchmarks?

Added the __simple_* defs in V10. It's a bit ugly because I don't think it's quite
fair to benchmark these the way we do strings, because we need to keep the
dl_*hash functions inlined for a fair comparison. I added a comment expressing
that.

Also fixed all redundant parens (sorry, that's generally my stylistic preference,
but I should have gone with the style of the project).
>
> > +
> > +#include <support/test-driver.c>
> > diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
> > new file mode 100644
> > index 0000000000..085e1f8ee2
> > --- /dev/null
> > +++ b/benchtests/bench-nss-hash.c
> > @@ -0,0 +1,24 @@
> > +/* Measure __nss_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nss.h>
> > +#define TEST_FUNC __nss_hash
> > +
> > +uint32_t __nss_hash (const void *__key, size_t __length);
> > +
> > +#include "bench-hash-funcs.c"
>
> Reusing infrastructure.  OK.
>
> Thanks,
> Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-17  5:11     ` Siddhesh Poyarekar
@ 2022-05-18 17:34       ` Noah Goldstein
  2022-05-18 17:35         ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:34 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Tue, May 17, 2022 at 12:11 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > The prior unrolling didn't really do much as it left the dependency
> > chain between iterations. Unrolled the loop for 4 so 4x multiplies
> > could be pipelined in out-of-order machines.
> >
> > Results for __nss_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=25 runs
> > Geometric of all benchmark New / Old: 0.845
> >    type, length, New Time, Old Time, New Time / Old Time
> >   fixed,      0,    4.019,    3.729,               1.078
> >   fixed,      1,     4.95,    5.707,               0.867
> >   fixed,      2,    5.152,    5.657,               0.911
> >   fixed,      3,    4.641,    5.721,               0.811
> >   fixed,      4,    5.551,     5.81,               0.955
> >   fixed,      5,    6.525,    6.552,               0.996
> >   fixed,      6,    6.711,    6.561,               1.023
> >   fixed,      7,    6.715,    6.767,               0.992
> >   fixed,      8,    7.874,    7.915,               0.995
> >   fixed,      9,    8.888,    9.767,                0.91
> >   fixed,     10,    8.959,    9.762,               0.918
> >   fixed,     11,    9.188,    9.987,                0.92
> >   fixed,     12,    9.708,   10.618,               0.914
> >   fixed,     13,   10.393,    11.14,               0.933
> >   fixed,     14,   10.628,   12.097,               0.879
> >   fixed,     15,   10.982,   12.965,               0.847
> >   fixed,     16,   11.851,   14.429,               0.821
> >   fixed,     32,   24.334,   34.414,               0.707
> >   fixed,     64,   55.618,   86.688,               0.642
> >   fixed,    128,  118.261,   224.36,               0.527
> >   fixed,    256,  256.183,  538.629,               0.476
> > random,      2,   11.194,   11.556,               0.969
> > random,      4,   17.516,   17.205,               1.018
> > random,      8,   23.501,   20.985,                1.12
> > random,     16,   28.131,   29.212,               0.963
> > random,     32,   35.436,   38.662,               0.917
> > random,     64,    45.74,   58.868,               0.777
> > random,    128,   75.394,  121.963,               0.618
> > random,    256,  139.524,  260.726,               0.535
> > ---
> >   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
> >   1 file changed, 42 insertions(+), 37 deletions(-)
> >
> > diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> > index 27a348ea9b..c6a375f386 100644
> > --- a/nss/nss_hash.c
> > +++ b/nss/nss_hash.c
> > @@ -19,58 +19,63 @@
> >
> >   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
> >   /*
> > - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> > + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
> >    * units.  On the first time through the loop we get the "leftover bytes"
> > - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> > - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> > - * this routine is heavily used enough, it's worth the ugly coding.
> > + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> > + * HASHC. Further unrolling does not appear to help.
> >    *
> >    * OZ's original sdbm hash
> >    */
> >   uint32_t
> >   __nss_hash (const void *keyarg, size_t len)
> >   {
> > +  enum
> > +  {
> > +    HASH_CONST_P0 = 1,              /* (uint32_t)(65599 ^ 0).  */
> > +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> > +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> > +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> > +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> > +  };
> > +
> >     const unsigned char *key;
> > -  size_t loop;
> >     uint32_t h;
> >
> > -#define HASHC   h = *key++ + 65599 * h
> > +#define HASHC        h = *key++ + HASH_CONST_P1 * h
> >
> >     h = 0;
> >     key = keyarg;
> >     if (len > 0)
> >       {
> > -      loop = (len + 8 - 1) >> 3;
> > -      switch (len & (8 - 1))
> > -        {
> > -        case 0:
> > -          do
> > -            {
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 7:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 6:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 5:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 4:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 3:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 2:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 1:
> > -              HASHC;
> > -            }
> > -       while (--loop);
> > -        }
> > +      switch ((len & (4 - 1)))
> > +     {
> > +     case 0:
> > +       /* h starts out as zero so no need to include the multiply. */
> > +       h = *key++;
> > +       /* FALLTHROUGH */
> > +     case 3:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     case 2:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     case 1:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     }
>
> The first 4 bytes, also sufficient for len <= 4.  OK.
>
> > +
> > +      uint32_t c0, c1, c2, c3;
> > +      for (--len; len >= 4; len -= 4)
> > +     {
> > +       c0 = (unsigned char) *(key + 0);
> > +       c1 = (unsigned char) *(key + 1);
> > +       c2 = (unsigned char) *(key + 2);
> > +       c3 = (unsigned char) *(key + 3);
> > +       h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> > +           + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> > +
> > +       key += 4;
> > +     }
>
> Remaining larger lengths.  OK.
>
> >       }
> >     return h;
> >   }
>
> TBH this wins solely on the front of the code being easier to
> understand.  The fact that it is also faster in some cases is a bonus :)
>
> LGTM.
>
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

No change to this file in V10.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-18 17:34       ` Noah Goldstein
@ 2022-05-18 17:35         ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:35 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Wed, May 18, 2022 at 12:34 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, May 17, 2022 at 12:11 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
> >
> > On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > > The prior unrolling didn't really do much as it left the dependency
> > > chain between iterations. Unrolled the loop for 4 so 4x multiplies
> > > could be pipelined in out-of-order machines.
> > >
> > > Results for __nss_hash
> > > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> > >
> > > Time as Geometric Mean of N=25 runs
> > > Geometric of all benchmark New / Old: 0.845
> > >    type, length, New Time, Old Time, New Time / Old Time
> > >   fixed,      0,    4.019,    3.729,               1.078
> > >   fixed,      1,     4.95,    5.707,               0.867
> > >   fixed,      2,    5.152,    5.657,               0.911
> > >   fixed,      3,    4.641,    5.721,               0.811
> > >   fixed,      4,    5.551,     5.81,               0.955
> > >   fixed,      5,    6.525,    6.552,               0.996
> > >   fixed,      6,    6.711,    6.561,               1.023
> > >   fixed,      7,    6.715,    6.767,               0.992
> > >   fixed,      8,    7.874,    7.915,               0.995
> > >   fixed,      9,    8.888,    9.767,                0.91
> > >   fixed,     10,    8.959,    9.762,               0.918
> > >   fixed,     11,    9.188,    9.987,                0.92
> > >   fixed,     12,    9.708,   10.618,               0.914
> > >   fixed,     13,   10.393,    11.14,               0.933
> > >   fixed,     14,   10.628,   12.097,               0.879
> > >   fixed,     15,   10.982,   12.965,               0.847
> > >   fixed,     16,   11.851,   14.429,               0.821
> > >   fixed,     32,   24.334,   34.414,               0.707
> > >   fixed,     64,   55.618,   86.688,               0.642
> > >   fixed,    128,  118.261,   224.36,               0.527
> > >   fixed,    256,  256.183,  538.629,               0.476
> > > random,      2,   11.194,   11.556,               0.969
> > > random,      4,   17.516,   17.205,               1.018
> > > random,      8,   23.501,   20.985,                1.12
> > > random,     16,   28.131,   29.212,               0.963
> > > random,     32,   35.436,   38.662,               0.917
> > > random,     64,    45.74,   58.868,               0.777
> > > random,    128,   75.394,  121.963,               0.618
> > > random,    256,  139.524,  260.726,               0.535
> > > ---
> > >   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
> > >   1 file changed, 42 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> > > index 27a348ea9b..c6a375f386 100644
> > > --- a/nss/nss_hash.c
> > > +++ b/nss/nss_hash.c
> > > @@ -19,58 +19,63 @@
> > >
> > >   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
> > >   /*
> > > - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> > > + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
> > >    * units.  On the first time through the loop we get the "leftover bytes"
> > > - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> > > - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> > > - * this routine is heavily used enough, it's worth the ugly coding.
> > > + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> > > + * HASHC. Further unrolling does not appear to help.
> > >    *
> > >    * OZ's original sdbm hash
> > >    */
> > >   uint32_t
> > >   __nss_hash (const void *keyarg, size_t len)
> > >   {
> > > +  enum
> > > +  {
> > > +    HASH_CONST_P0 = 1,              /* (uint32_t)(65599 ^ 0).  */
> > > +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> > > +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> > > +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> > > +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> > > +  };
> > > +
> > >     const unsigned char *key;
> > > -  size_t loop;
> > >     uint32_t h;
> > >
> > > -#define HASHC   h = *key++ + 65599 * h
> > > +#define HASHC        h = *key++ + HASH_CONST_P1 * h
> > >
> > >     h = 0;
> > >     key = keyarg;
> > >     if (len > 0)
> > >       {
> > > -      loop = (len + 8 - 1) >> 3;
> > > -      switch (len & (8 - 1))
> > > -        {
> > > -        case 0:
> > > -          do
> > > -            {
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 7:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 6:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 5:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 4:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 3:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 2:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 1:
> > > -              HASHC;
> > > -            }
> > > -       while (--loop);
> > > -        }
> > > +      switch ((len & (4 - 1)))
> > > +     {
> > > +     case 0:
> > > +       /* h starts out as zero so no need to include the multiply. */
> > > +       h = *key++;
> > > +       /* FALLTHROUGH */
> > > +     case 3:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     case 2:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     case 1:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     }
> >
> > The first 4 bytes, also sufficient for len <= 4.  OK.
> >
> > > +
> > > +      uint32_t c0, c1, c2, c3;
> > > +      for (--len; len >= 4; len -= 4)
> > > +     {
> > > +       c0 = (unsigned char) *(key + 0);
> > > +       c1 = (unsigned char) *(key + 1);
> > > +       c2 = (unsigned char) *(key + 2);
> > > +       c3 = (unsigned char) *(key + 3);
> > > +       h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> > > +           + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> > > +
> > > +       key += 4;
> > > +     }
> >
> > Remaining larger lengths.  OK.
> >
> > >       }
> > >     return h;
> > >   }
> >
> > TBH this wins solely on the front of the code being easier to
> > understand.  The fact that it is also faster in some cases is a bonus :)
> >
> > LGTM.
> >
> > Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
>
> No change to this file in V10.

NB: I added __simple_nss_hash to a new header file, as I don't think
it's really justifiable to add a new function, purely for
testing/benchmarking, that can't easily be dead-code eliminated (DCE).

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-17  5:12     ` Siddhesh Poyarekar
@ 2022-05-18 17:38       ` Noah Goldstein
  2022-05-19 15:59         ` Siddhesh Poyarekar
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:38 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library, Alexander Monakov

On Tue, May 17, 2022 at 12:12 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> Not sure why, but the series failed to apply on trybot.  It's probably a
> trybot bug because it applied just fine on my up to date copy.

I think it may be related to the fact that earlier versions deleted
elf/dl-new-hash.h.


>
> On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. The unrolling allows for
> > pipelined multiplies.
> >
> > As well, as an optional sysdep, reorder the operations and prevent
> > reassosiation for better scheduling and higher ILP. This commit
>
> reassociation.  Later in the patch too.

Fixed throughout in V10.
>
> > only adds the barrier for x86, although it should be either no
> > change or a win for any architecture.
> >
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric of all benchmark New / Old: 0.674
> >    type, length, New Time, Old Time, New Time / Old Time
> >   fixed,      0,    2.865,     2.72,               1.053
> >   fixed,      1,    3.567,    2.489,               1.433
> >   fixed,      2,    2.577,    3.649,               0.706
> >   fixed,      3,    3.644,    5.983,               0.609
> >   fixed,      4,    4.211,    6.833,               0.616
> >   fixed,      5,    4.741,    9.372,               0.506
> >   fixed,      6,    5.415,    9.561,               0.566
> >   fixed,      7,    6.649,   10.789,               0.616
> >   fixed,      8,    8.081,   11.808,               0.684
> >   fixed,      9,    8.427,   12.935,               0.651
> >   fixed,     10,    8.673,   14.134,               0.614
> >   fixed,     11,    10.69,   15.408,               0.694
> >   fixed,     12,   10.789,   16.982,               0.635
> >   fixed,     13,   12.169,   18.411,               0.661
> >   fixed,     14,   12.659,   19.914,               0.636
> >   fixed,     15,   13.526,   21.541,               0.628
> >   fixed,     16,   14.211,   23.088,               0.616
> >   fixed,     32,   29.412,   52.722,               0.558
> >   fixed,     64,    65.41,  142.351,               0.459
> >   fixed,    128,  138.505,  295.625,               0.469
> >   fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >   sysdeps/generic/dl-new-hash.h      | 114 +++++++++++++++++++++++++++++
> >   {elf => sysdeps/x86}/dl-new-hash.h |  16 +---
>
> This breaks the benchmark build, but including just dl-new-hash.h in the
> benchmark should fix it.

Fixed in V10.
>
> >   2 files changed, 117 insertions(+), 13 deletions(-)
> >   create mode 100644 sysdeps/generic/dl-new-hash.h
> >   rename {elf => sysdeps/x86}/dl-new-hash.h (77%)
> >
> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..84aa7991a4
> > --- /dev/null
> > +++ b/sysdeps/generic/dl-new-hash.h
> > @@ -0,0 +1,114 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
>
> Same header included twice.

Missed that in V10. Will wait for feedback on the rest of the changes
and update with V11.
>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +   _dl_new_hash (const char *s)
> > +   {
> > +      uint32_t h = 5381;
> > +      for (unsigned char c = *s; c != '\0'; c = *++s)
> > +        h = h * 33 + c;
> > +      return h;
> > +   }
> > +
> > +   We can get better performance by slightly unrolling the
> > +   loop to pipeline the multiples.
>
> .. the multiples, which gcc cannot easily do due to dependencies across
> iterations.
>
> > +   As well, as an architecture specific option we add asm statements
> > +   to explicitly specifying order of operations to prevent
>
> to explicitly specify the order...

Fixed in V10.
>
> > +   reassosiation of instructions that lengthens the loop carried
> > +   dependency. This may have no affect as the compiler may have
> > +   ordered instructions the same way without it but in testing this
> > +   has not been the case for GCC. Improving GCC to reliably schedule
> > +   instructions ideally cannot be easily done.
> > +
> > +   Architecture(s) that use the reassosiation barries are:
> > +   x86
> > +
> > +   Note it is very unlikely the reassosiation barriers would
> > +   de-optimize performance on any archictecture and with an imperfect
>
> architecture

Fixed in V10.
>
> > +   compiler it may help performance, especially on out-of-order cpus,
> > +   so it is suggested that the respective maintainers add them.  */
>
> Suggest: "architecture maintainers are encouraged to benchmark this with
> __asm_reassociation_barrier defined to __asm__ like it is in x86."
>

Took your suggestion in V10.
> > +
> > +
> > +#ifndef __asm_reassociation_barrier
> > +# define __asm_reassociation_barrier(...)
> > +#endif
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *str)
> > +{
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = s[0];
> > +      /* Since hashed string is normally not empty, this is unlikely on the
> > +      first iteration of the loop.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +     return h;
> > +
> > +      c1 = s[1];
> > +      if (c1 == 0)
> > +     {
> > +       /* Ideal instruction scheduling is:
>
> Suggest: "Ideal computation order is"

Took your suggestion in V10.
>
> > +      c0 += h;
> > +      h *= 32;
> > +      h += c0;
> > +
> > +      The __asm_reassociation_barrier() macro is a sysdep optional asm
> > +      statements to prevents reassosiation that would result in more
> > +      instruction interdependencies and worse scheduling.  */
>
> This bit is redundant with the description at the top near
> __asm_reassociation_barrier.

Dropped in V10.
>
> > +       c0 += h;
> > +       __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> > +       h = h * 32 + c0;
> > +       return h;
> > +     }
> > +
> > +      /* Ideal instruction scheduling is:
>
> Same: "Ideal computation order is"

Took your suggestion in V10.
>
> > +      c1 += c0;
> > +      h *= 33 * 33;
> > +      c0 *= 32;
> > +      c1 += c0;
> > +      h  += c1;
> > +
> > +      The __asm_reassociation_barrier() macro is a sysdep optional asm
> > +      statements to prevents reassosiation that would result in more
> > +      instruction interdependencies and worse scheduling.  */
>
> This too is redundant.

Dropped in V10.
>
> > +      c1 += c0;
> > +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      __asm_reassociation_barrier("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> > +}
> > +
> > +#endif /* dl-new-hash.h */
> > diff --git a/elf/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> > similarity index 77%
> > rename from elf/dl-new-hash.h
> > rename to sysdeps/x86/dl-new-hash.h
> > index b7a91ecc07..dd800265bf 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/sysdeps/x86/dl-new-hash.h
> > @@ -19,19 +19,9 @@
> >   #ifndef _DL_NEW_HASH_H
> >   #define _DL_NEW_HASH_H 1
>
> No need to define this...

Fixed in V10.
>
> >
> > -#include <stdint.h>
> > -/* For __always_inline.  */
> > -#include <sys/cdefs.h>
> > -
> > -static __always_inline uint32_t
> > -__attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > -{
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > -}
> > +#define __asm_reassociation_barrier __asm__
> >
> > +#undef _DL_NEW_HASH_H
>
> ... if it is unconditionally undefined here.
>
> > +#include <sysdeps/generic/dl-new-hash.h>
> >
> >   #endif /* dl-new-hash.h */
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:32     ` H.J. Lu
@ 2022-05-18 17:39       ` Noah Goldstein
  2022-05-19  7:53       ` Siddhesh Poyarekar
  1 sibling, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-18 17:39 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell, Alexander Monakov

On Wed, May 18, 2022 at 12:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein
> <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. The unrolling allows for
> > pipelined multiplies.
> >
> > As well, as an optional sysdep, reorder the operations and prevent
> > reassosiation for better scheduling and higher ILP. This commit
> > only adds the barrier for x86, although it should be either no
> > change or a win for any architecture.
> >
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric of all benchmark New / Old: 0.674
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    2.865,     2.72,               1.053
> >  fixed,      1,    3.567,    2.489,               1.433
> >  fixed,      2,    2.577,    3.649,               0.706
> >  fixed,      3,    3.644,    5.983,               0.609
> >  fixed,      4,    4.211,    6.833,               0.616
> >  fixed,      5,    4.741,    9.372,               0.506
> >  fixed,      6,    5.415,    9.561,               0.566
> >  fixed,      7,    6.649,   10.789,               0.616
> >  fixed,      8,    8.081,   11.808,               0.684
> >  fixed,      9,    8.427,   12.935,               0.651
> >  fixed,     10,    8.673,   14.134,               0.614
> >  fixed,     11,    10.69,   15.408,               0.694
> >  fixed,     12,   10.789,   16.982,               0.635
> >  fixed,     13,   12.169,   18.411,               0.661
> >  fixed,     14,   12.659,   19.914,               0.636
> >  fixed,     15,   13.526,   21.541,               0.628
> >  fixed,     16,   14.211,   23.088,               0.616
> >  fixed,     32,   29.412,   52.722,               0.558
> >  fixed,     64,    65.41,  142.351,               0.459
> >  fixed,    128,  138.505,  295.625,               0.469
> >  fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  benchtests/bench-dl-new-hash.c              |   3 +-
> >  elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
> >  elf/tst-dl-hash.c                           |   1 +
> >  sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
> >  sysdeps/x86/dl-new-hash.h                   |  24 +++++
> >  5 files changed, 146 insertions(+), 13 deletions(-)
> >  rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
> >  create mode 100644 sysdeps/generic/dl-new-hash.h
> >  create mode 100644 sysdeps/x86/dl-new-hash.h
> >
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > index 3c8a1d5a82..040fa7ce01 100644
> > --- a/benchtests/bench-dl-new-hash.c
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -16,7 +16,8 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <elf/dl-new-hash.h>
> > +#include <dl-new-hash.h>
> > +#include <elf/simple-dl-new-hash.h>
> >  #define TEST_FUNC(x, y) _dl_new_hash (x)
> >  #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> > similarity index 75%
> > rename from elf/dl-new-hash.h
> > rename to elf/simple-dl-new-hash.h
> > index 8641bb4196..1437b1bd36 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/simple-dl-new-hash.h
> > @@ -1,4 +1,4 @@
> > -/* _dl_new_hash for elf symbol lookup
> > +/* __simple_dl_new_hash for testing true elf symbol lookup.
> >     Copyright (C) 2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -16,16 +16,16 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#ifndef _DL_NEW_HASH_H
> > -#define _DL_NEW_HASH_H 1
> > +#ifndef _SIMPLE_DL_NEW_HASH_H
> > +#define _SIMPLE_DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > -/* For __always_inline.  */
> > -#include <sys/cdefs.h>
> >
> > -static __always_inline uint32_t
> > +/* For testing/benchmarking purposes.  Real implementation in
> > +   sysdeps/generic/dl-new-hash.h.  */
> > +static uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +__simple_dl_new_hash (const char *s)
> >  {
> >    uint32_t h = 5381;
> >    for (unsigned char c = *s; c != '\0'; c = *++s)
> > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
> >    return h;
> >  }
> >
> > -/* For testing/benchmarking purposes.  */
> > -#define __simple_dl_new_hash _dl_new_hash
> > -
> > -
> > -#endif /* dl-new-hash.h */
> > +#endif /* simple-dl-new-hash.h */
> > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> > index 8697eb73a0..b21766c63d 100644
> > --- a/elf/tst-dl-hash.c
> > +++ b/elf/tst-dl-hash.c
> > @@ -18,6 +18,7 @@
> >
> >
> >  #include <simple-dl-hash.h>
> > +#include <simple-dl-new-hash.h>
> >  #include <dl-hash.h>
> >  #include <dl-new-hash.h>
> >  #include <support/support.h>
> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..1faf309c97
> > --- /dev/null
> > +++ b/sysdeps/generic/dl-new-hash.h
> > @@ -0,0 +1,111 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +   _dl_new_hash (const char *s)
> > +   {
> > +      uint32_t h = 5381;
> > +      for (unsigned char c = *s; c != '\0'; c = *++s)
> > +        h = h * 33 + c;
> > +      return h;
> > +   }
> > +
> > +   We can get better performance by slightly unrolling the loop to
> > +   pipeline the multiples, which gcc cannot easily do due to
> > +   dependencies across iterations.
> > +
> > +   As well, as an architecture specific option we add asm statements
> > +   to explicitly specify order of operations and prevent reassociation
> > +   of instructions that lengthens the loop carried dependency. This
> > +   may have no affect as the compiler may have ordered instructions
> > +   the same way without it but in testing this has not been the case
> > +   for GCC. Improving GCC to reliably schedule instructions ideally
> > +   cannot be easily done.
> > +
> > +   Architecture(s) that use the reassociation barries are:
> > +   x86
> > +
> > +   Note it is very unlikely the reassociation barriers would
> > +   de-optimize performance on any architecture and with an imperfect
> > +   compiler it may help performance, especially on out-of-order cpus,
> > +   so it is suggested that the respective maintainers add them.
> > +
> > +   architecture maintainers are encouraged to benchmark this with
> > +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> > +*/
> > +
> > +
> > +#ifndef __asm_reassociation_barrier
> > +# define __asm_reassociation_barrier(...)
> > +#endif
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *str)
> > +{
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = s[0];
> > +      /* Since hashed string is normally not empty, this is unlikely on the
> > +        first iteration of the loop.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       return h;
> > +
> > +      c1 = s[1];
> > +      if (c1 == 0)
> > +       {
> > +         /* Ideal computational order is:
> > +        c0 += h;
> > +        h *= 32;
> > +        h += c0;  */
> > +         c0 += h;
> > +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> > +         h = h * 32 + c0;
> > +         return h;
> > +       }
> > +
> > +      /* Ideal computational order is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;  */
> > +      c1 += c0;
> > +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      __asm_reassociation_barrier("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> > +}
> > +
> > +#endif /* dl-new-hash.h */
> > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..ce8fb5a838
> > --- /dev/null
> > +++ b/sysdeps/x86/dl-new-hash.h
> > @@ -0,0 +1,24 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifdef __asm_reassociation_barrier
> > +# error "__asm_reassociation_barrier should never already be defined."
> > +#endif
> > +
> > +#define __asm_reassociation_barrier __asm__
> > +#include <sysdeps/generic/dl-new-hash.h>
> > --
> > 2.34.1
> >
>
> Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
> and leave the generic one unchanged?

I think the expectation is that the generic code is going to be a win across
the board. The sysdep aspect of the perf was the asm barriers.

But again, I've only benchmarked on x86.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:32     ` H.J. Lu
  2022-05-18 17:39       ` Noah Goldstein
@ 2022-05-19  7:53       ` Siddhesh Poyarekar
  1 sibling, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19  7:53 UTC (permalink / raw)
  To: H.J. Lu, Noah Goldstein; +Cc: Alexander Monakov, GNU C Library

On 18/05/2022 23:02, H.J. Lu via Libc-alpha wrote:
> On Wed, May 18, 2022 at 10:26 AM Noah Goldstein
> <goldstein.w.n@gmail.com> wrote:
>>
>> Unroll slightly and enforce good instruction scheduling. This improves
>> performance on out-of-order machines. The unrolling allows for
>> pipelined multiplies.
>>
>> As well, as an optional sysdep, reorder the operations and prevent
>> reassociation for better scheduling and higher ILP. This commit
>> only adds the barrier for x86, although it should be either no
>> change or a win for any architecture.
>>
>> Unrolling further started to induce slowdowns for sizes [0, 4]
>> but can help the loop so if larger sizes are the target further
>> unrolling can be beneficial.
>>
>> Results for _dl_new_hash
>> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>>
>> Time as Geometric Mean of N=30 runs
>> Geometric of all benchmark New / Old: 0.674
>>    type, length, New Time, Old Time, New Time / Old Time
>>   fixed,      0,    2.865,     2.72,               1.053
>>   fixed,      1,    3.567,    2.489,               1.433
>>   fixed,      2,    2.577,    3.649,               0.706
>>   fixed,      3,    3.644,    5.983,               0.609
>>   fixed,      4,    4.211,    6.833,               0.616
>>   fixed,      5,    4.741,    9.372,               0.506
>>   fixed,      6,    5.415,    9.561,               0.566
>>   fixed,      7,    6.649,   10.789,               0.616
>>   fixed,      8,    8.081,   11.808,               0.684
>>   fixed,      9,    8.427,   12.935,               0.651
>>   fixed,     10,    8.673,   14.134,               0.614
>>   fixed,     11,    10.69,   15.408,               0.694
>>   fixed,     12,   10.789,   16.982,               0.635
>>   fixed,     13,   12.169,   18.411,               0.661
>>   fixed,     14,   12.659,   19.914,               0.636
>>   fixed,     15,   13.526,   21.541,               0.628
>>   fixed,     16,   14.211,   23.088,               0.616
>>   fixed,     32,   29.412,   52.722,               0.558
>>   fixed,     64,    65.41,  142.351,               0.459
>>   fixed,    128,  138.505,  295.625,               0.469
>>   fixed,    256,  291.707,  601.983,               0.485
>> random,      2,   12.698,   12.849,               0.988
>> random,      4,   16.065,   15.857,               1.013
>> random,      8,   19.564,   21.105,               0.927
>> random,     16,   23.919,   26.823,               0.892
>> random,     32,   31.987,   39.591,               0.808
>> random,     64,   49.282,   71.487,               0.689
>> random,    128,    82.23,  145.364,               0.566
>> random,    256,  152.209,  298.434,                0.51
>>
>> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
>> ---
>>   benchtests/bench-dl-new-hash.c              |   3 +-
>>   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>>   elf/tst-dl-hash.c                           |   1 +
>>   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>>   sysdeps/x86/dl-new-hash.h                   |  24 +++++
>>   5 files changed, 146 insertions(+), 13 deletions(-)
>>   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>>   create mode 100644 sysdeps/generic/dl-new-hash.h
>>   create mode 100644 sysdeps/x86/dl-new-hash.h
>>
>> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
>> index 3c8a1d5a82..040fa7ce01 100644
>> --- a/benchtests/bench-dl-new-hash.c
>> +++ b/benchtests/bench-dl-new-hash.c
>> @@ -16,7 +16,8 @@
>>      License along with the GNU C Library; if not, see
>>      <https://www.gnu.org/licenses/>.  */
>>
>> -#include <elf/dl-new-hash.h>
>> +#include <dl-new-hash.h>
>> +#include <elf/simple-dl-new-hash.h>
>>   #define TEST_FUNC(x, y) _dl_new_hash (x)
>>   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>>
>> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
>> similarity index 75%
>> rename from elf/dl-new-hash.h
>> rename to elf/simple-dl-new-hash.h
>> index 8641bb4196..1437b1bd36 100644
>> --- a/elf/dl-new-hash.h
>> +++ b/elf/simple-dl-new-hash.h
>> @@ -1,4 +1,4 @@
>> -/* _dl_new_hash for elf symbol lookup
>> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>>      Copyright (C) 2022 Free Software Foundation, Inc.
>>      This file is part of the GNU C Library.
>>
>> @@ -16,16 +16,16 @@
>>      License along with the GNU C Library; if not, see
>>      <https://www.gnu.org/licenses/>.  */
>>
>> -#ifndef _DL_NEW_HASH_H
>> -#define _DL_NEW_HASH_H 1
>> +#ifndef _SIMPLE_DL_NEW_HASH_H
>> +#define _SIMPLE_DL_NEW_HASH_H 1
>>
>>   #include <stdint.h>
>> -/* For __always_inline.  */
>> -#include <sys/cdefs.h>
>>
>> -static __always_inline uint32_t
>> +/* For testing/benchmarking purposes.  Real implementation in
>> +   sysdeps/generic/dl-new-hash.h.  */
>> +static uint32_t
>>   __attribute__ ((unused))
>> -_dl_new_hash (const char *s)
>> +__simple_dl_new_hash (const char *s)
>>   {
>>     uint32_t h = 5381;
>>     for (unsigned char c = *s; c != '\0'; c = *++s)
>> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>>     return h;
>>   }
>>
>> -/* For testing/benchmarking purposes.  */
>> -#define __simple_dl_new_hash _dl_new_hash
>> -
>> -
>> -#endif /* dl-new-hash.h */
>> +#endif /* simple-dl-new-hash.h */
>> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
>> index 8697eb73a0..b21766c63d 100644
>> --- a/elf/tst-dl-hash.c
>> +++ b/elf/tst-dl-hash.c
>> @@ -18,6 +18,7 @@
>>
>>
>>   #include <simple-dl-hash.h>
>> +#include <simple-dl-new-hash.h>
>>   #include <dl-hash.h>
>>   #include <dl-new-hash.h>
>>   #include <support/support.h>
>> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
>> new file mode 100644
>> index 0000000000..1faf309c97
>> --- /dev/null
>> +++ b/sysdeps/generic/dl-new-hash.h
>> @@ -0,0 +1,111 @@
>> +/* _dl_new_hash for elf symbol lookup
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _DL_NEW_HASH_H
>> +#define _DL_NEW_HASH_H 1
>> +
>> +#include <stdint.h>
>> +/* For __always_inline.  */
>> +#include <sys/cdefs.h>
>> +/* For __glibc_unlikely.  */
>> +#include <sys/cdefs.h>
>> +
>> +/* The simplest implementation of _dl_new_hash is:
>> +
>> +   _dl_new_hash (const char *s)
>> +   {
>> +      uint32_t h = 5381;
>> +      for (unsigned char c = *s; c != '\0'; c = *++s)
>> +        h = h * 33 + c;
>> +      return h;
>> +   }
>> +
>> +   We can get better performance by slightly unrolling the loop to
>> +   pipeline the multiplies, which gcc cannot easily do due to
>> +   dependencies across iterations.
>> +
>> +   As well, as an architecture specific option we add asm statements
>> +   to explicitly specify order of operations and prevent reassociation
>> +   of instructions that lengthens the loop carried dependency. This
>> +   may have no effect as the compiler may have ordered instructions
>> +   the same way without it but in testing this has not been the case
>> +   for GCC. Improving GCC to reliably schedule instructions ideally
>> +   cannot be easily done.
>> +
>> +   Architecture(s) that use the reassociation barriers are:
>> +   x86
>> +
>> +   Note it is very unlikely the reassociation barriers would
>> +   de-optimize performance on any architecture and with an imperfect
>> +   compiler it may help performance, especially on out-of-order cpus,
>> +   so it is suggested that the respective maintainers add them.
>> +
>> +   Architecture maintainers are encouraged to benchmark this with
>> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
>> +*/
>> +
>> +
>> +#ifndef __asm_reassociation_barrier
>> +# define __asm_reassociation_barrier(...)
>> +#endif
>> +
>> +static __always_inline uint32_t
>> +__attribute__ ((unused))
>> +_dl_new_hash (const char *str)
>> +{
>> +  const unsigned char *s = (const unsigned char *) str;
>> +  unsigned int h = 5381;
>> +  unsigned int c0, c1;
>> +  for (;;)
>> +    {
>> +      c0 = s[0];
>> +      /* Since hashed string is normally not empty, this is unlikely on the
>> +        first iteration of the loop.  */
>> +      if (__glibc_unlikely (c0 == 0))
>> +       return h;
>> +
>> +      c1 = s[1];
>> +      if (c1 == 0)
>> +       {
>> +         /* Ideal computational order is:
>> +        c0 += h;
>> +        h *= 32;
>> +        h += c0;  */
>> +         c0 += h;
>> +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
>> +         h = h * 32 + c0;
>> +         return h;
>> +       }
>> +
>> +      /* Ideal computational order is:
>> +        c1 += c0;
>> +        h *= 33 * 33;
>> +        c0 *= 32;
>> +        c1 += c0;
>> +        h  += c1;  */
>> +      c1 += c0;
>> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
>> +      h *= 33 * 33;
>> +      c1 += c0 * 32;
>> +      __asm_reassociation_barrier("" : "+r"(c1));
>> +      h += c1;
>> +      s += 2;
>> +    }
>> +}
>> +
>> +#endif /* dl-new-hash.h */
>> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
>> new file mode 100644
>> index 0000000000..ce8fb5a838
>> --- /dev/null
>> +++ b/sysdeps/x86/dl-new-hash.h
>> @@ -0,0 +1,24 @@
>> +/* _dl_new_hash for elf symbol lookup
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifdef __asm_reassociation_barrier
>> +# error "__asm_reassociation_barrier should never already be defined."
>> +#endif
>> +
>> +#define __asm_reassociation_barrier __asm__
>> +#include <sysdeps/generic/dl-new-hash.h>
>> --
>> 2.34.1
>>
> 
> Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
> and leave the generic one unchanged?
> 

There are 3 implementations: the reference one in elf/dl-new-hash.h
that's retained for verification, the optimized one in
sysdeps/generic/dl-new-hash.h that is suitable for all architectures, and
the micro-optimized one with an optimized schedule for x86, giving it that
little bit more.

Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-05-18 17:26   ` [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-19 14:47   ` Siddhesh Poyarekar
  2022-05-19 14:50     ` Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 14:47 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> No change to the code other than moving the function to
> dl-new-hash.h. Changed name so it's now in the reserved namespace.
> ---
>   elf/dl-lookup.c   | 13 ++-----------
>   elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 42 insertions(+), 11 deletions(-)
>   create mode 100644 elf/dl-new-hash.h
> 
> diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> index 989b073e4f..a42f6d5390 100644
> --- a/elf/dl-lookup.c
> +++ b/elf/dl-lookup.c
> @@ -24,6 +24,7 @@
>   #include <ldsodefs.h>
>   #include <dl-hash.h>
>   #include <dl-machine.h>
> +#include <dl-new-hash.h>
>   #include <dl-protected.h>
>   #include <sysdep-cancel.h>
>   #include <libc-lock.h>
> @@ -558,16 +559,6 @@ skip:
>   }
>   
>   
> -static uint32_t
> -dl_new_hash (const char *s)
> -{
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> -}
> -
> -
>   /* Add extra dependency on MAP to UNDEF_MAP.  */
>   static int
>   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
>   		     const struct r_found_version *version,
>   		     int type_class, int flags, struct link_map *skip_map)
>   {
> -  const unsigned int new_hash = dl_new_hash (undef_name);
> +  const unsigned int new_hash = _dl_new_hash (undef_name);
>     unsigned long int old_hash = 0xffffffff;
>     struct sym_val current_value = { NULL, NULL };
>     struct r_scope_elem **scope = symbol_scope;
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> new file mode 100644
> index 0000000000..8641bb4196
> --- /dev/null
> +++ b/elf/dl-new-hash.h
> @@ -0,0 +1,40 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
> +/* For testing/benchmarking purposes.  */
> +#define __simple_dl_new_hash _dl_new_hash
> +
> +
> +#endif /* dl-new-hash.h */

Uhmm, you're going to call it __simple_dl_new_hash.  A bit roundabout, 
but OK.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-18 17:26   ` [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-19 14:49     ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 14:49 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> If we want to further optimize the functions, tests are needed.
> ---
>   elf/Makefile         |   1 +
>   elf/simple-dl-hash.h |  42 ++++++++++++++++
>   elf/tst-dl-hash.c    | 115 +++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 158 insertions(+)
>   create mode 100644 elf/simple-dl-hash.h
>   create mode 100644 elf/tst-dl-hash.c

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/elf/Makefile b/elf/Makefile
> index ce3345ed92..adf1bcf6ce 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -312,6 +312,7 @@ tests := \
>     tst-array4 \
>     tst-array5 \
>     tst-auxv \
> +  tst-dl-hash \
>     tst-leaks1 \
>     tst-stringtable \
>     tst-tls9 \
> diff --git a/elf/simple-dl-hash.h b/elf/simple-dl-hash.h
> new file mode 100644
> index 0000000000..53702b3c55
> --- /dev/null
> +++ b/elf/simple-dl-hash.h
> @@ -0,0 +1,42 @@
> +/* __simple_dl_elf_hash for testing true elf symbol lookup.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SIMPLE_DL_ELF_HASH_H
> +#define _SIMPLE_DL_ELF_HASH_H 1
> +
> +#include <stdint.h>
> +
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-hash.h.  */
> +static uint32_t
> +__attribute__ ((unused))
> +__simple_dl_elf_hash (const char *name_arg)
> +{
> +  unsigned long int hash = 0;
> +  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
> +    {
> +      unsigned long int hi;
> +      hash = (hash << 4) + c;
> +      hi = hash & 0xf0000000;
> +      hash ^= hi >> 24;
> +      hash &= 0x0fffffff;
> +    }
> +  return hash;
> +}
> +
> +#endif /* simple-dl-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> new file mode 100644
> index 0000000000..8697eb73a0
> --- /dev/null
> +++ b/elf/tst-dl-hash.c
> @@ -0,0 +1,115 @@
> +/* Test dl-hash functions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <simple-dl-hash.h>
> +#include <dl-hash.h>
> +#include <dl-new-hash.h>
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +typedef unsigned int (*hash_f) (const char *);
> +
> +
> +
> +static int
> +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> +	      hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  char buf[len + 1];
> +  memset (buf, fill, len);
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
> +		res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
> +		    &__simple_dl_new_hash))
> +    return 1;
> +
> +  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
> +		       &__simple_dl_elf_hash);
> +}
> +
> +static int
> +do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len + 1];
> +  char v;
> +  for (i = 0; i < len; ++i)
> +    {
> +      v = random ();
> +      if (v == 0)
> +	v = 1;
> +
> +      buf[i] = v;
> +    }
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &__simple_dl_new_hash))
> +    return 1;
> +
> +  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &__simple_dl_elf_hash);
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +	{
> +	  if (do_rand_tests (i))
> +	    return 1;
> +
> +	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +	    return 1;
> +	}
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-19 14:47   ` [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
@ 2022-05-19 14:50     ` Noah Goldstein
  2022-05-19 14:56       ` Siddhesh Poyarekar
  0 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 14:50 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Thu, May 19, 2022 at 9:47 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > No change to the code other than moving the function to
> > dl-new-hash.h. Changed name so it's now in the reserved namespace.
> > ---
> >   elf/dl-lookup.c   | 13 ++-----------
> >   elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 42 insertions(+), 11 deletions(-)
> >   create mode 100644 elf/dl-new-hash.h
> >
> > diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> > index 989b073e4f..a42f6d5390 100644
> > --- a/elf/dl-lookup.c
> > +++ b/elf/dl-lookup.c
> > @@ -24,6 +24,7 @@
> >   #include <ldsodefs.h>
> >   #include <dl-hash.h>
> >   #include <dl-machine.h>
> > +#include <dl-new-hash.h>
> >   #include <dl-protected.h>
> >   #include <sysdep-cancel.h>
> >   #include <libc-lock.h>
> > @@ -558,16 +559,6 @@ skip:
> >   }
> >
> >
> > -static uint32_t
> > -dl_new_hash (const char *s)
> > -{
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > -}
> > -
> > -
> >   /* Add extra dependency on MAP to UNDEF_MAP.  */
> >   static int
> >   add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> > @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
> >                    const struct r_found_version *version,
> >                    int type_class, int flags, struct link_map *skip_map)
> >   {
> > -  const unsigned int new_hash = dl_new_hash (undef_name);
> > +  const unsigned int new_hash = _dl_new_hash (undef_name);
> >     unsigned long int old_hash = 0xffffffff;
> >     struct sym_val current_value = { NULL, NULL };
> >     struct r_scope_elem **scope = symbol_scope;
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..8641bb4196
> > --- /dev/null
> > +++ b/elf/dl-new-hash.h
> > @@ -0,0 +1,40 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *s)
> > +{
> > +  uint32_t h = 5381;
> > +  for (unsigned char c = *s; c != '\0'; c = *++s)
> > +    h = h * 33 + c;
> > +  return h;
> > +}
> > +
> > +/* For testing/benchmarking purposes.  */
> > +#define __simple_dl_new_hash _dl_new_hash
> > +
> > +
> > +#endif /* dl-new-hash.h */
>
> Uhmm, you're going to call it __simple_dl_new_hash.  A bit roundabout,
> but OK.

It's in a header. Doesn't it need to be in the reserved namespace?
>
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-19 14:50     ` Noah Goldstein
@ 2022-05-19 14:56       ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 14:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On 19/05/2022 20:20, Noah Goldstein wrote:
> On Thu, May 19, 2022 at 9:47 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>>
>> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
>>> No change to the code other than moving the function to
>>> dl-new-hash.h. Changed name so it's now in the reserved namespace.
>>> ---
>>>    elf/dl-lookup.c   | 13 ++-----------
>>>    elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
>>>    2 files changed, 42 insertions(+), 11 deletions(-)
>>>    create mode 100644 elf/dl-new-hash.h
>>>
>>> diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
>>> index 989b073e4f..a42f6d5390 100644
>>> --- a/elf/dl-lookup.c
>>> +++ b/elf/dl-lookup.c
>>> @@ -24,6 +24,7 @@
>>>    #include <ldsodefs.h>
>>>    #include <dl-hash.h>
>>>    #include <dl-machine.h>
>>> +#include <dl-new-hash.h>
>>>    #include <dl-protected.h>
>>>    #include <sysdep-cancel.h>
>>>    #include <libc-lock.h>
>>> @@ -558,16 +559,6 @@ skip:
>>>    }
>>>
>>>
>>> -static uint32_t
>>> -dl_new_hash (const char *s)
>>> -{
>>> -  uint32_t h = 5381;
>>> -  for (unsigned char c = *s; c != '\0'; c = *++s)
>>> -    h = h * 33 + c;
>>> -  return h;
>>> -}
>>> -
>>> -
>>>    /* Add extra dependency on MAP to UNDEF_MAP.  */
>>>    static int
>>>    add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
>>> @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
>>>                     const struct r_found_version *version,
>>>                     int type_class, int flags, struct link_map *skip_map)
>>>    {
>>> -  const unsigned int new_hash = dl_new_hash (undef_name);
>>> +  const unsigned int new_hash = _dl_new_hash (undef_name);
>>>      unsigned long int old_hash = 0xffffffff;
>>>      struct sym_val current_value = { NULL, NULL };
>>>      struct r_scope_elem **scope = symbol_scope;
>>> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
>>> new file mode 100644
>>> index 0000000000..8641bb4196
>>> --- /dev/null
>>> +++ b/elf/dl-new-hash.h
>>> @@ -0,0 +1,40 @@
>>> +/* _dl_new_hash for elf symbol lookup
>>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#ifndef _DL_NEW_HASH_H
>>> +#define _DL_NEW_HASH_H 1
>>> +
>>> +#include <stdint.h>
>>> +/* For __always_inline.  */
>>> +#include <sys/cdefs.h>
>>> +
>>> +static __always_inline uint32_t
>>> +__attribute__ ((unused))
>>> +_dl_new_hash (const char *s)
>>> +{
>>> +  uint32_t h = 5381;
>>> +  for (unsigned char c = *s; c != '\0'; c = *++s)
>>> +    h = h * 33 + c;
>>> +  return h;
>>> +}
>>> +
>>> +/* For testing/benchmarking purposes.  */
>>> +#define __simple_dl_new_hash _dl_new_hash
>>> +
>>> +
>>> +#endif /* dl-new-hash.h */
>>
>> Uhmm, you're going to call it __simple_dl_new_hash.  A bit roundabout,
>> but OK.
> 
> It's in a header. Doesn't it need to be in reserved namespace?

Ah, not that.  What I'd have done was to keep it as a real implementation
and just have the sysdeps one override it, like some of the POSIX
implementations that return ENOSYS and then implementations in sysdeps
override them.

Then you include <elf/dl-new-hash.h> in the test case as a reference
implementation, like the string tests do for some of the C string
implementations.

But what you've done is also fine.

Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-18 17:26   ` [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-19 15:09     ` Siddhesh Poyarekar
  2022-05-19 15:40       ` Siddhesh Poyarekar
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:09 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> If we want to further optimize the function, tests are needed.
> ---
>   nss/Makefile          |  1 +
>   nss/nss_hash.c        | 16 +++++++++
>   nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
>   nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 139 insertions(+)
>   create mode 100644 nss/simple-nss-hash.h
>   create mode 100644 nss/tst-nss-hash.c

LGTM.

Reviewed-by: Siddhesh Poyarekar

> 
> diff --git a/nss/Makefile b/nss/Makefile
> index d8b06b44fb..a978e3927a 100644
> --- a/nss/Makefile
> +++ b/nss/Makefile
> @@ -62,6 +62,7 @@ tests := \
>     test-digits-dots \
>     test-netdb \
>     tst-nss-getpwent \
> +  tst-nss-hash \
>     tst-nss-test1 \
>     tst-nss-test2 \
>     tst-nss-test4 \
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 27a348ea9b..f9e17d068a 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -75,4 +75,20 @@ __nss_hash (const void *keyarg, size_t len)
>     return h;
>   }
>   
> +/* For testing/benchmarking purposes. */
> +static uint32_t
> +__simple_nss_hash (const void *keyarg, size_t len)
> +{
> +  const unsigned char *key;
> +  size_t i;
> +  uint32_t h = 0;
> +  key = keyarg;
> +
> +  for (i = 0; i < len; ++i)
> +    h = *key++ + 65599 * h;
> +
> +  return h;
> +}
> +
> +
>   libc_hidden_def (__nss_hash)
> diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
> new file mode 100644
> index 0000000000..47708972e7
> --- /dev/null
> +++ b/nss/simple-nss-hash.h
> @@ -0,0 +1,42 @@
> +/* __simple_nss_hash for testing nss_hash function
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SIMPLE_NSS_HASH_H
> +#define _SIMPLE_NSS_HASH_H 1
> +
> +#include <stdint.h>
> +
> +/* For testing/benchmarking purposes.  Real implementation in
> +   nss/nss_hash.c.  */
> +static uint32_t
> +__attribute__ ((unused))
> +__simple_nss_hash (const void *keyarg, size_t len)
> +{
> +  const unsigned char *key;
> +  size_t i;
> +  uint32_t h = 0;
> +  key = keyarg;
> +
> +  for (i = 0; i < len; ++i)
> +    h = *key++ + 65599 * h;
> +
> +  return h;
> +}
> +
> +
> +#endif /* simple-nss-hash.h */
> diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
> new file mode 100644
> index 0000000000..5ec1f9b0c5
> --- /dev/null
> +++ b/nss/tst-nss-hash.c
> @@ -0,0 +1,80 @@
> +/* Test __nss_hash
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +#include <nss.h>
> +#include <simple-nss-hash.h>
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  uint32_t expec, res;
> +  char buf[len];
> +  memset (buf, fill, len);
> +
> +  expec = __simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len];
> +  for (i = 0; i < len; ++i)
> +    buf[i] = random ();
> +
> +  expec = __simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +	{
> +	  if (do_rand_tests (i))
> +	    return 1;
> +
> +	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +	    return 1;
> +	}
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-18 17:26   ` [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-19 15:34     ` Siddhesh Poyarekar
  2022-05-19 22:20       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:34 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> Benchtests are for throughput and include random / fixed size
> benchmarks.
> ---
>   benchtests/Makefile                  |  25 ++++-
>   benchtests/README                    |   9 +-
>   benchtests/bench-dl-elf-hash.c       |  27 +++++
>   benchtests/bench-dl-new-hash.c       |  25 +++++
>   benchtests/bench-hash-funcs-kernel.h |  92 ++++++++++++++++
>   benchtests/bench-hash-funcs.c        | 152 +++++++++++++++++++++++++++
>   benchtests/bench-nss-hash.c          |  26 +++++
>   7 files changed, 348 insertions(+), 8 deletions(-)
>   create mode 100644 benchtests/bench-dl-elf-hash.c
>   create mode 100644 benchtests/bench-dl-new-hash.c
>   create mode 100644 benchtests/bench-hash-funcs-kernel.h
>   create mode 100644 benchtests/bench-hash-funcs.c
>   create mode 100644 benchtests/bench-nss-hash.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index de9de5cf58..c279041e19 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -227,6 +227,12 @@ LOCALES := \
>   include ../gen-locales.mk
>   endif
>   
> +hash-benchset := \
> +  dl-elf-hash \
> +  dl-new-hash \
> +  nss-hash \
> +# hash-benchset
> +
>   stdlib-benchset := strtod
>   
>   stdio-common-benchset := sprintf
> @@ -235,7 +241,7 @@ math-benchset := math-inlines
>   
>   ifeq (${BENCHSET},)
>   benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> -	    $(math-benchset)
> +	    $(math-benchset) $(hash-benchset)
>   else
>   benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
>   endif
> @@ -363,9 +369,20 @@ bench-clean:
>   
>   # Validate the passed in BENCHSET
>   ifneq ($(strip ${BENCHSET}),)
> -VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
> -   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
> -   malloc-thread malloc-simple
> +VALIDBENCHSETNAMES := \
> +  bench-math \
> +  bench-pthread \
> +  bench-string \
> +  hash-benchset \
> +  malloc-simple \
> +  malloc-thread \
> +  math-benchset \
> +  stdio-common-benchset \
> +  stdlib-benchset \
> +  string-benchset \
> +  wcsmbs-benchset \
> +# VALIDBENCHSETNAMES
> +
>   INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
>   ifneq (${INVALIDBENCHSETNAMES},)
>   $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
> diff --git a/benchtests/README b/benchtests/README
> index 4d83a05b4b..998ba9b2b4 100644
> --- a/benchtests/README
> +++ b/benchtests/README
> @@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
>       bench-math
>       bench-pthread
>       bench-string
> +    hash-benchset
> +    malloc-thread
> +    math-benchset
> +    stdio-common-benchset
> +    stdlib-benchset
>       string-benchset
>       wcsmbs-benchset
> -    stdlib-benchset
> -    stdio-common-benchset
> -    math-benchset
> -    malloc-thread
>   
>   Adding a function to benchtests:
>   ===============================
> diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
> new file mode 100644
> index 0000000000..067de9fca4
> --- /dev/null
> +++ b/benchtests/bench-dl-elf-hash.c
> @@ -0,0 +1,27 @@
> +/* Measure _dl_elf_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <dl-hash.h>
> +#include <elf/simple-dl-hash.h>
> +#define TEST_FUNC(x, y) _dl_elf_hash (x)
> +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_elf_hash (x)
> +
> +#define TEST_NAME "_dl_elf_hash"
> +
> +
> +#include "bench-hash-funcs.c"

OK.

> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> new file mode 100644
> index 0000000000..3c8a1d5a82
> --- /dev/null
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -0,0 +1,25 @@
> +/* Measure _dl_new_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <elf/dl-new-hash.h>
> +#define TEST_FUNC(x, y) _dl_new_hash (x)
> +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
> +
> +#define TEST_NAME "_dl_new_hash"
> +
> +#include "bench-hash-funcs.c"

OK.

> diff --git a/benchtests/bench-hash-funcs-kernel.h b/benchtests/bench-hash-funcs-kernel.h
> new file mode 100644
> index 0000000000..9f9f245641
> --- /dev/null
> +++ b/benchtests/bench-hash-funcs-kernel.h
> @@ -0,0 +1,92 @@
> +/* Actual benchmark kernels used by bench-hash-funcs.c
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +
> +/* We go through the trouble of using macros here because many of the
> +   hash functions are meant to be inlined so it's not fair to benchmark
> +   them with a function pointer where they won't be inlinable. */
> +#undef RUN_FUNC
> +#undef POSTFIX
> +#ifdef SIMPLE
> +# define RUN_FUNC SIMPLE_TEST_FUNC
> +# define POSTFIX _simple
> +#else
> +# define RUN_FUNC TEST_FUNC
> +# define POSTFIX _optimized
> +#endif
> +
> +#define PRIMITIVE_CAT(x, y) x ## y
> +#define CAT(x, y) PRIMITIVE_CAT (x, y)
> +
> +static double __attribute__ ((noinline, noclone))
> +CAT (do_one_test_kernel, POSTFIX) (const char *s, size_t len)
> +{
> +
> +  unsigned int iters;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (iters = NFIXED_ITERS / 32; iters; --iters)
> +    {

Redundant braces here and elsewhere below.

> +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> +    }
> +
> +  TIMING_NOW (start);
> +  for (iters = NFIXED_ITERS; iters; --iters)
> +    {
> +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (len);
> +  return (double) cur / (double) NFIXED_ITERS;
> +}
> +
> +static double __attribute__ ((noinline, noclone))
> +CAT (do_rand_test_kernel, POSTFIX) (char const *bufs,
> +				    unsigned int const *sizes)
> +{
> +  unsigned int i, iters;
> +  size_t offset;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
> +    {
> +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> +    }
> +
> +  TIMING_NOW (start);
> +  for (iters = NRAND_ITERS; iters; --iters)
> +    {
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	{
> +	  DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> +	}
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (sizes);
> +  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
> +}
> diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
> new file mode 100644
> index 0000000000..3d3c736ffc
> --- /dev/null
> +++ b/benchtests/bench-hash-funcs.c
> @@ -0,0 +1,152 @@
> +/* Measure hash functions runtime.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#ifndef TEST_FUNC
> +# error "No TEST_FUNC provided!"
> +#endif
> +#ifndef SIMPLE_TEST_FUNC
> +# error "No SIMPLE_TEST_FUNC provided!"
> +#endif
> +
> +#ifndef TEST_NAME
> +# define STRINGIFY_PRIMITIVE(x) #  x
> +# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
> +
> +# define TEST_NAME STRINGIFY (TEST_FUNC)
> +#endif
> +
> +#include "json-lib.h"
> +#include "bench-timing.h"
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
> +
> +enum
> +{
> +  NFIXED_ITERS = 1048576,
> +  NRAND_BUFS = 16384,
> +  NRAND_ITERS = 2048,
> +  RAND_BENCH_MAX_LEN = 128
> +};
> +
> +#include "bench-hash-funcs-kernel.h"
> +#define SIMPLE
> +#include "bench-hash-funcs-kernel.h"
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, size_t len)
> +{
> +  char buf[len + 1];
> +  memset (buf, -1, len);
> +  buf[len] = '\0';
> +
> +  json_element_object_begin (json_ctx);
> +
> +  json_attr_string (json_ctx, "type", "fixed");
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_double (json_ctx, "time_simple", do_one_test_kernel_simple (buf, len));
> +  json_attr_double (json_ctx, "time_optimized", do_one_test_kernel_optimized (buf, len));
> +
> +  json_element_object_end (json_ctx);
> +}
> +
> +static void __attribute__ ((noinline, noclone))
> +do_rand_test (json_ctx_t *json_ctx)
> +{
> +  size_t i, sz, offset;
> +  char *bufs;
> +  unsigned int *sizes;
> +
> +  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
> +  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
> +  if (bufs == NULL || sizes == NULL)
> +    {
> +      fprintf (stderr, "Failed to allocate bufs for random test\n");
> +      goto done;
> +    }
> +
> +  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_string (json_ctx, "type", "random");
> +      json_attr_uint (json_ctx, "length", sz);
> +
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	{
> +	  sizes[i] = random () % sz;
> +	  memset (bufs + offset, -1, sizes[i]);
> +	  bufs[offset + sizes[i]] = '\0';
> +	}
> +
> +      json_attr_double (json_ctx, "time_simple",
> +			do_rand_test_kernel_simple (bufs, sizes));
> +      json_attr_double (json_ctx, "time_optimized",
> +			do_rand_test_kernel_optimized (bufs, sizes));
> +      json_element_object_end (json_ctx);
> +    }
> +
> +done:
> +  if (bufs)
> +    {

Redundant braces here and multiple places in this file.

> +      free (bufs);
> +    }
> +  if (sizes)
> +    {
> +      free (sizes);
> +    }
> +}
> +
> +static int
> +do_test (void)
> +{
> +  int i;
> +  json_ctx_t json_ctx;
> +
> +  json_init (&json_ctx, 0, stdout);
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_array_begin (&json_ctx, "results");
> +
> +  for (i = 0; i < 16; ++i)
> +    {
> +      do_one_test (&json_ctx, i);
> +    }
> +
> +  for (i = 16; i <= 256; i += i)
> +    {
> +      do_one_test (&json_ctx, i);
> +    }
> +
> +  do_rand_test (&json_ctx);
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
> new file mode 100644
> index 0000000000..7e369428a2
> --- /dev/null
> +++ b/benchtests/bench-nss-hash.c
> @@ -0,0 +1,26 @@
> +/* Measure __nss_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nss.h>
> +#include <nss/simple-nss-hash.h>
> +#define TEST_FUNC __nss_hash
> +#define SIMPLE_TEST_FUNC __simple_nss_hash
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +#include "bench-hash-funcs.c"

OK.

So just the redundant braces to fix and you should be OK.  Please send 
V11 with the change.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-19 15:09     ` Siddhesh Poyarekar
@ 2022-05-19 15:40       ` Siddhesh Poyarekar
  2022-05-19 22:20         ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:40 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 19/05/2022 20:39, Siddhesh Poyarekar wrote:
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
>> If we want to further optimize the function, tests are needed.
>> ---
>>   nss/Makefile          |  1 +
>>   nss/nss_hash.c        | 16 +++++++++
>>   nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
>>   nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 139 insertions(+)
>>   create mode 100644 nss/simple-nss-hash.h
>>   create mode 100644 nss/tst-nss-hash.c
> 
> LGTM.
> 
> Reviewed-by: Siddhesh Poyarekar
> 
>>
>> diff --git a/nss/Makefile b/nss/Makefile
>> index d8b06b44fb..a978e3927a 100644
>> --- a/nss/Makefile
>> +++ b/nss/Makefile
>> @@ -62,6 +62,7 @@ tests := \
>>     test-digits-dots \
>>     test-netdb \
>>     tst-nss-getpwent \
>> +  tst-nss-hash \
>>     tst-nss-test1 \
>>     tst-nss-test2 \
>>     tst-nss-test4 \
>> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
>> index 27a348ea9b..f9e17d068a 100644
>> --- a/nss/nss_hash.c
>> +++ b/nss/nss_hash.c
>> @@ -75,4 +75,20 @@ __nss_hash (const void *keyarg, size_t len)
>>     return h;
>>   }
>> +/* For testing/benchmarking purposes. */
>> +static uint32_t
>> +__simple_nss_hash (const void *keyarg, size_t len)
>> +{
>> +  const unsigned char *key;
>> +  size_t i;
>> +  uint32_t h = 0;
>> +  key = keyarg;
>> +
>> +  for (i = 0; i < len; ++i)
>> +    h = *key++ + 65599 * h;
>> +
>> +  return h;
>> +}
>> +
>> +

It just struck me (while reviewing 5/6) that this is duplicated in 
simple-nss-hash.h below.  Shouldn't it be one or the other?  I know it's 
"fixed" in 5/6 but it would be nice to restructure things so that the 
tree builds at this point of the patchset too.

>>   libc_hidden_def (__nss_hash)
>> diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
>> new file mode 100644
>> index 0000000000..47708972e7
>> --- /dev/null
>> +++ b/nss/simple-nss-hash.h
>> @@ -0,0 +1,42 @@
>> +/* __simple_nss_hash for testing nss_hash function
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _SIMPLE_NSS_HASH_H
>> +#define _SIMPLE_NSS_HASH_H 1
>> +
>> +#include <stdint.h>
>> +
>> +/* For testing/benchmarking purposes.  Real implementation in
>> +   nss/nss_hash.c.  */
>> +static uint32_t
>> +__attribute__ ((unused))
>> +__simple_nss_hash (const void *keyarg, size_t len)
>> +{
>> +  const unsigned char *key;
>> +  size_t i;
>> +  uint32_t h = 0;
>> +  key = keyarg;
>> +
>> +  for (i = 0; i < len; ++i)
>> +    h = *key++ + 65599 * h;
>> +
>> +  return h;
>> +}
>> +
>> +
>> +#endif /* simple-nss-hash.h */
>> diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
>> new file mode 100644
>> index 0000000000..5ec1f9b0c5
>> --- /dev/null
>> +++ b/nss/tst-nss-hash.c
>> @@ -0,0 +1,80 @@
>> +/* Test __nss_hash
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <support/support.h>
>> +#include <support/check.h>
>> +#include <stdio.h>
>> +#include <string.h>
>> +#include <stdlib.h>
>> +#include <nss.h>
>> +#include <simple-nss-hash.h>
>> +
>> +uint32_t __nss_hash (const void *__key, size_t __length);
>> +
>> +static int
>> +do_fill_tests (size_t len, int fill)
>> +{
>> +  uint32_t expec, res;
>> +  char buf[len];
>> +  memset (buf, fill, len);
>> +
>> +  expec = __simple_nss_hash (buf, len);
>> +  res = __nss_hash (buf, len);
>> +  if (expec != res)
>> +    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, 
>> res);
>> +
>> +  return 0;
>> +}
>> +
>> +static int
>> +do_rand_tests (size_t len)
>> +{
>> +  uint32_t expec, res;
>> +  size_t i;
>> +  char buf[len];
>> +  for (i = 0; i < len; ++i)
>> +    buf[i] = random ();
>> +
>> +  expec = __simple_nss_hash (buf, len);
>> +  res = __nss_hash (buf, len);
>> +  if (expec != res)
>> +    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
>> +
>> +  return 0;
>> +}
>> +
>> +static int
>> +do_test (void)
>> +{
>> +  size_t i, j;
>> +  for (i = 0; i < 100; ++i)
>> +    {
>> +      for (j = 0; j < 8192; ++j)
>> +    {
>> +      if (do_rand_tests (i))
>> +        return 1;
>> +
>> +      if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
>> +          || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
>> +        return 1;
>> +    }
>> +    }
>> +  return 0;
>> +}
>> +
>> +#include <support/test-driver.c>
> 


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-18 17:26   ` [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-19 15:41     ` Siddhesh Poyarekar
  2022-05-19 22:21       ` Noah Goldstein
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:41 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> The prior unrolling didn't really do much as it left the dependency
> chain between iterations. Unrolled the loop for 4 so 4x multiplies
> could be pipelined in out-of-order machines.
> 
> Results for __nss_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=25 runs
> Geometric mean of all benchmarks New / Old: 0.845
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    4.019,    3.729,               1.078
>   fixed,      1,     4.95,    5.707,               0.867
>   fixed,      2,    5.152,    5.657,               0.911
>   fixed,      3,    4.641,    5.721,               0.811
>   fixed,      4,    5.551,     5.81,               0.955
>   fixed,      5,    6.525,    6.552,               0.996
>   fixed,      6,    6.711,    6.561,               1.023
>   fixed,      7,    6.715,    6.767,               0.992
>   fixed,      8,    7.874,    7.915,               0.995
>   fixed,      9,    8.888,    9.767,                0.91
>   fixed,     10,    8.959,    9.762,               0.918
>   fixed,     11,    9.188,    9.987,                0.92
>   fixed,     12,    9.708,   10.618,               0.914
>   fixed,     13,   10.393,    11.14,               0.933
>   fixed,     14,   10.628,   12.097,               0.879
>   fixed,     15,   10.982,   12.965,               0.847
>   fixed,     16,   11.851,   14.429,               0.821
>   fixed,     32,   24.334,   34.414,               0.707
>   fixed,     64,   55.618,   86.688,               0.642
>   fixed,    128,  118.261,   224.36,               0.527
>   fixed,    256,  256.183,  538.629,               0.476
> random,      2,   11.194,   11.556,               0.969
> random,      4,   17.516,   17.205,               1.018
> random,      8,   23.501,   20.985,                1.12
> random,     16,   28.131,   29.212,               0.963
> random,     32,   35.436,   38.662,               0.917
> random,     64,    45.74,   58.868,               0.777
> random,    128,   75.394,  121.963,               0.618
> random,    256,  139.524,  260.726,               0.535
> ---
>   nss/nss_hash.c | 92 ++++++++++++++++++++++----------------------------
>   1 file changed, 41 insertions(+), 51 deletions(-)
> 
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index f9e17d068a..1d3787e675 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -19,74 +19,64 @@
>   
>   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
>   /*
> - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
>    * units.  On the first time through the loop we get the "leftover bytes"
> - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> - * this routine is heavily used enough, it's worth the ugly coding.
> + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> + * of HASHC.  Further unrolling does not appear to help.
>    *
>    * OZ's original sdbm hash
>    */
>   uint32_t
>   __nss_hash (const void *keyarg, size_t len)
>   {
> +  enum
> +  {
> +    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
> +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> +  };
> +
>     const unsigned char *key;
> -  size_t loop;
>     uint32_t h;
>   
> -#define HASHC   h = *key++ + 65599 * h
> +#define HASHC	h = *key++ + HASH_CONST_P1 * h
>   
>     h = 0;
>     key = keyarg;
>     if (len > 0)
>       {
> -      loop = (len + 8 - 1) >> 3;
> -      switch (len & (8 - 1))
> -        {
> -        case 0:
> -          do
> -            {
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 7:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 6:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 5:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 4:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 3:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 2:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 1:
> -              HASHC;
> -            }
> -	  while (--loop);
> -        }
> -    }
> -  return h;
> -}
> +      switch ((len & (4 - 1)))
> +	{
> +	case 0:
> +	  /* h starts out as zero so no need to include the multiply. */
> +	  h = *key++;
> +	  /* FALLTHROUGH */
> +	case 3:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 2:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 1:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	}
>   
> -/* For testing/benchmarking purposes. */
> -static uint32_t
> -__simple_nss_hash (const void *keyarg, size_t len)
> -{
> -  const unsigned char *key;
> -  size_t i;
> -  uint32_t h = 0;
> -  key = keyarg;
> -
> -  for (i = 0; i < len; ++i)
> -    h = *key++ + 65599 * h;
> +      uint32_t c0, c1, c2, c3;
> +      for (--len; len >= 4; len -= 4)
> +	{
> +	  c0 = (unsigned char) *(key + 0);
> +	  c1 = (unsigned char) *(key + 1);
> +	  c2 = (unsigned char) *(key + 2);
> +	  c3 = (unsigned char) *(key + 3);
> +	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> +	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
>   
> +	  key += 4;
> +	}
> +    }
>     return h;
>   }
>   

This bottom bit should get dropped in 3/6 (and not have 
__simple_nss_hash in two places) and then reintroduced here.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:26   ` [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-18 17:32     ` H.J. Lu
@ 2022-05-19 15:55     ` Siddhesh Poyarekar
  2022-05-19 22:22       ` Noah Goldstein
  1 sibling, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:55 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha; +Cc: Alexander Monakov

On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
> 
> As well, as an optional sysdep, reorder the operations and prevent
> reassociation for better scheduling and higher ILP. This commit
> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
> 
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   benchtests/bench-dl-new-hash.c              |   3 +-
>   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>   elf/tst-dl-hash.c                           |   1 +
>   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>   sysdeps/x86/dl-new-hash.h                   |  24 +++++
>   5 files changed, 146 insertions(+), 13 deletions(-)
>   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>   create mode 100644 sysdeps/generic/dl-new-hash.h
>   create mode 100644 sysdeps/x86/dl-new-hash.h

Mostly OK, just minor nits to fix below.

> 
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> index 3c8a1d5a82..040fa7ce01 100644
> --- a/benchtests/bench-dl-new-hash.c
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -16,7 +16,8 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#include <elf/dl-new-hash.h>
> +#include <dl-new-hash.h>
> +#include <elf/simple-dl-new-hash.h>
>   #define TEST_FUNC(x, y) _dl_new_hash (x)
>   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)

OK.

>   
> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> similarity index 75%
> rename from elf/dl-new-hash.h
> rename to elf/simple-dl-new-hash.h
> index 8641bb4196..1437b1bd36 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/simple-dl-new-hash.h
> @@ -1,4 +1,4 @@
> -/* _dl_new_hash for elf symbol lookup
> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>      Copyright (C) 2022 Free Software Foundation, Inc.
>      This file is part of the GNU C Library.
>   
> @@ -16,16 +16,16 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#ifndef _DL_NEW_HASH_H
> -#define _DL_NEW_HASH_H 1
> +#ifndef _SIMPLE_DL_NEW_HASH_H
> +#define _SIMPLE_DL_NEW_HASH_H 1
>   
>   #include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
>   
> -static __always_inline uint32_t
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-new-hash.h.  */
> +static uint32_t
>   __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +__simple_dl_new_hash (const char *s)
>   {
>     uint32_t h = 5381;
>     for (unsigned char c = *s; c != '\0'; c = *++s)
> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>     return h;
>   }
>   
> -/* For testing/benchmarking purposes.  */
> -#define __simple_dl_new_hash _dl_new_hash
> -
> -
> -#endif /* dl-new-hash.h */
> +#endif /* simple-dl-new-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> index 8697eb73a0..b21766c63d 100644
> --- a/elf/tst-dl-hash.c
> +++ b/elf/tst-dl-hash.c
> @@ -18,6 +18,7 @@
>   
>   
>   #include <simple-dl-hash.h>
> +#include <simple-dl-new-hash.h>
>   #include <dl-hash.h>
>   #include <dl-new-hash.h>
>   #include <support/support.h>
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..1faf309c97
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,111 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>

Duplicate, but you already know this.

> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the loop to
> +   pipeline the multiplies, which gcc cannot easily do due to
> +   dependencies across iterations.
> +
> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specify order of operations and prevent reassociation
> +   of instructions that lengthens the loop carried dependency. This
> +   may have no effect as the compiler may have ordered instructions
> +   the same way without it but in testing this has not been the case
> +   for GCC. Making GCC reliably produce the ideal instruction schedule
> +   cannot be easily done.
> +
> +   Architecture(s) that use the reassociation barries are:

barriers

> +   x86
> +
> +   Note it is very unlikely the reassociation barriers would
> +   de-optimize performance on any architecture and with an imperfect
> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.
> +
> +   architecture maintainers are encouraged to benchmark this with

Architecture

> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> +*/
> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +	 first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  /* Ideal computational order is:
> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;  */
> +	  c0 += h;
> +	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal computational order is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;  */
> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +

OK.

> +#endif /* dl-new-hash.h */
> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> new file mode 100644
> index 0000000000..ce8fb5a838
> --- /dev/null
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -0,0 +1,24 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __asm_reassociation_barrier
> +# error "__asm_reassociation_barrier should never already be defined."
> +#endif
> +
> +#define __asm_reassociation_barrier __asm__
> +#include <sysdeps/generic/dl-new-hash.h>

OK.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-18 17:38       ` Noah Goldstein
@ 2022-05-19 15:59         ` Siddhesh Poyarekar
  2022-05-19 16:54           ` DJ Delorie
  0 siblings, 1 reply; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-19 15:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Alexander Monakov, DJ Delorie

On 18/05/2022 23:08, Noah Goldstein wrote:
> On Tue, May 17, 2022 at 12:12 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>>
>> Not sure why, but the series failed to apply on trybot.  It's probably a
>> trybot bug because it applied just fine on my up to date copy.
> 
> I think it may be related to the fact that earlier versions deleted
> elf/dl-new-hash.h.
> 
> 

Yeah I think trybot is tripping on a git-style diff with renames, which 
patch by itself may not be able to parse.  DJ, is this something we 
could fix in trybot?

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-19 15:59         ` Siddhesh Poyarekar
@ 2022-05-19 16:54           ` DJ Delorie
  0 siblings, 0 replies; 167+ messages in thread
From: DJ Delorie @ 2022-05-19 16:54 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: goldstein.w.n, libc-alpha, amonakov

Siddhesh Poyarekar <siddhesh@gotplt.org> writes:
> Yeah I think trybot is tripping on a git-style diff with renames, which 
> patch by itself may not be able to parse.  DJ, is this something we 
> could fix in trybot?

We changed the apply-patch trybot to use "git apply", we can do the same
for the 32bit trybot.


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
                   ` (16 preceding siblings ...)
  2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
@ 2022-05-19 22:17 ` Noah Goldstein
  2022-05-19 22:17   ` [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
                     ` (5 more replies)
  17 siblings, 6 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:17 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Siddhesh Poyarekar

No change to the code other than moving the function to
dl-new-hash.h. Changed name so it's now in the reserved namespace.
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
---
 elf/dl-lookup.c   | 13 ++-----------
 elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 elf/dl-new-hash.h

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 989b073e4f..a42f6d5390 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -24,6 +24,7 @@
 #include <ldsodefs.h>
 #include <dl-hash.h>
 #include <dl-machine.h>
+#include <dl-new-hash.h>
 #include <dl-protected.h>
 #include <sysdep-cancel.h>
 #include <libc-lock.h>
@@ -558,16 +559,6 @@ skip:
 }
 
 
-static uint32_t
-dl_new_hash (const char *s)
-{
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
-}
-
-
 /* Add extra dependency on MAP to UNDEF_MAP.  */
 static int
 add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
@@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
 		     const struct r_found_version *version,
 		     int type_class, int flags, struct link_map *skip_map)
 {
-  const unsigned int new_hash = dl_new_hash (undef_name);
+  const unsigned int new_hash = _dl_new_hash (undef_name);
   unsigned long int old_hash = 0xffffffff;
   struct sym_val current_value = { NULL, NULL };
   struct r_scope_elem **scope = symbol_scope;
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
new file mode 100644
index 0000000000..8641bb4196
--- /dev/null
+++ b/elf/dl-new-hash.h
@@ -0,0 +1,40 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *s)
+{
+  uint32_t h = 5381;
+  for (unsigned char c = *s; c != '\0'; c = *++s)
+    h = h * 33 + c;
+  return h;
+}
+
+/* For testing/benchmarking purposes.  */
+#define __simple_dl_new_hash _dl_new_hash
+
+
+#endif /* dl-new-hash.h */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
@ 2022-05-19 22:17   ` Noah Goldstein
  2022-05-19 22:19     ` Noah Goldstein
  2022-05-19 22:18   ` [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:17 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Siddhesh Poyarekar

If we want to further optimize the functions, tests are needed.
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
---
 elf/Makefile         |   1 +
 elf/simple-dl-hash.h |  42 ++++++++++++++++
 elf/tst-dl-hash.c    | 115 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+)
 create mode 100644 elf/simple-dl-hash.h
 create mode 100644 elf/tst-dl-hash.c

diff --git a/elf/Makefile b/elf/Makefile
index 6f4ea78007..838fe39afb 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -312,6 +312,7 @@ tests := \
   tst-array4 \
   tst-array5 \
   tst-auxv \
+  tst-dl-hash \
   tst-leaks1 \
   tst-stringtable \
   tst-tls9 \
diff --git a/elf/simple-dl-hash.h b/elf/simple-dl-hash.h
new file mode 100644
index 0000000000..53702b3c55
--- /dev/null
+++ b/elf/simple-dl-hash.h
@@ -0,0 +1,42 @@
+/* __simple_dl_elf_hash for testing true elf symbol lookup.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SIMPLE_DL_ELF_HASH_H
+#define _SIMPLE_DL_ELF_HASH_H 1
+
+#include <stdint.h>
+
+/* For testing/benchmarking purposes.  Real implementation in
+   sysdeps/generic/dl-hash.h.  */
+static uint32_t
+__attribute__ ((unused))
+__simple_dl_elf_hash (const char *name_arg)
+{
+  unsigned long int hash = 0;
+  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
+    {
+      unsigned long int hi;
+      hash = (hash << 4) + c;
+      hi = hash & 0xf0000000;
+      hash ^= hi >> 24;
+      hash &= 0x0fffffff;
+    }
+  return hash;
+}
+
+#endif /* simple-dl-hash.h */
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
new file mode 100644
index 0000000000..8697eb73a0
--- /dev/null
+++ b/elf/tst-dl-hash.c
@@ -0,0 +1,115 @@
+/* Test dl-hash functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <simple-dl-hash.h>
+#include <dl-hash.h>
+#include <dl-new-hash.h>
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef unsigned int (*hash_f) (const char *);
+
+
+
+static int
+do_fill_test (size_t len, int fill, const char *name, hash_f testf,
+	      hash_f expecf)
+{
+  uint32_t expec, res;
+  char buf[len + 1];
+  memset (buf, fill, len);
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
+		res);
+
+  return 0;
+}
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
+		    &__simple_dl_new_hash))
+    return 1;
+
+  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
+		       &__simple_dl_elf_hash);
+}
+
+static int
+do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len + 1];
+  char v;
+  for (i = 0; i < len; ++i)
+    {
+      v = random ();
+      if (v == 0)
+	v = 1;
+
+      buf[i] = v;
+    }
+  buf[len] = '\0';
+
+  expec = expecf (buf);
+  res = testf (buf);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &__simple_dl_new_hash))
+    return 1;
+
+  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &__simple_dl_elf_hash);
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    return 1;
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    return 1;
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
  2022-05-19 22:17   ` [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-19 22:18   ` Noah Goldstein
  2022-05-23  7:42     ` Siddhesh Poyarekar
  2022-05-19 22:18   ` [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:18 UTC (permalink / raw)
  To: libc-alpha

If we want to further optimize the function, tests are needed.
---
 nss/Makefile          |  1 +
 nss/nss_hash.c        |  1 +
 nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
 nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+)
 create mode 100644 nss/simple-nss-hash.h
 create mode 100644 nss/tst-nss-hash.c

diff --git a/nss/Makefile b/nss/Makefile
index d8b06b44fb..a978e3927a 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -62,6 +62,7 @@ tests := \
   test-digits-dots \
   test-netdb \
   tst-nss-getpwent \
+  tst-nss-hash \
   tst-nss-test1 \
   tst-nss-test2 \
   tst-nss-test4 \
diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 27a348ea9b..3d8e4cf37e 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -75,4 +75,5 @@ __nss_hash (const void *keyarg, size_t len)
   return h;
 }
 
+
 libc_hidden_def (__nss_hash)
diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
new file mode 100644
index 0000000000..47708972e7
--- /dev/null
+++ b/nss/simple-nss-hash.h
@@ -0,0 +1,42 @@
+/* __simple_nss_hash for testing nss_hash function
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SIMPLE_NSS_HASH_H
+#define _SIMPLE_NSS_HASH_H 1
+
+#include <stdint.h>
+
+/* For testing/benchmarking purposes.  Real implementation in
+   nss/nss_hash.c.  */
+static uint32_t
+__attribute__ ((unused))
+__simple_nss_hash (const void *keyarg, size_t len)
+{
+  const unsigned char *key;
+  size_t i;
+  uint32_t h = 0;
+  key = keyarg;
+
+  for (i = 0; i < len; ++i)
+    h = *key++ + 65599 * h;
+
+  return h;
+}
+
+
+#endif /* simple-nss-hash.h */
diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
new file mode 100644
index 0000000000..5ec1f9b0c5
--- /dev/null
+++ b/nss/tst-nss-hash.c
@@ -0,0 +1,80 @@
+/* Test __nss_hash
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+#include <support/check.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <nss.h>
+#include <simple-nss-hash.h>
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+static int
+do_fill_tests (size_t len, int fill)
+{
+  uint32_t expec, res;
+  char buf[len];
+  memset (buf, fill, len);
+
+  expec = __simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
+
+  return 0;
+}
+
+static int
+do_rand_tests (size_t len)
+{
+  uint32_t expec, res;
+  size_t i;
+  char buf[len];
+  for (i = 0; i < len; ++i)
+    buf[i] = random ();
+
+  expec = __simple_nss_hash (buf, len);
+  res = __nss_hash (buf, len);
+  if (expec != res)
+    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  size_t i, j;
+  for (i = 0; i < 100; ++i)
+    {
+      for (j = 0; j < 8192; ++j)
+	{
+	  if (do_rand_tests (i))
+	    return 1;
+
+	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
+	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
+	    return 1;
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
  2022-05-19 22:17   ` [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
  2022-05-19 22:18   ` [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-19 22:18   ` Noah Goldstein
  2022-05-23  7:44     ` Siddhesh Poyarekar
  2022-05-19 22:18   ` [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:18 UTC (permalink / raw)
  To: libc-alpha

Benchtests are for throughput and include random / fixed size
benchmarks.
---
 benchtests/Makefile                  |  25 ++++-
 benchtests/README                    |   9 +-
 benchtests/bench-dl-elf-hash.c       |  27 +++++
 benchtests/bench-dl-new-hash.c       |  25 +++++
 benchtests/bench-hash-funcs-kernel.h |  86 ++++++++++++++++
 benchtests/bench-hash-funcs.c        | 145 +++++++++++++++++++++++++++
 benchtests/bench-nss-hash.c          |  26 +++++
 7 files changed, 335 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-dl-elf-hash.c
 create mode 100644 benchtests/bench-dl-new-hash.c
 create mode 100644 benchtests/bench-hash-funcs-kernel.h
 create mode 100644 benchtests/bench-hash-funcs.c
 create mode 100644 benchtests/bench-nss-hash.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index de9de5cf58..c279041e19 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -227,6 +227,12 @@ LOCALES := \
 include ../gen-locales.mk
 endif
 
+hash-benchset := \
+  dl-elf-hash \
+  dl-new-hash \
+  nss-hash \
+# hash-benchset
+
 stdlib-benchset := strtod
 
 stdio-common-benchset := sprintf
@@ -235,7 +241,7 @@ math-benchset := math-inlines
 
 ifeq (${BENCHSET},)
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(hash-benchset)
 else
 benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
 endif
@@ -363,9 +369,20 @@ bench-clean:
 
 # Validate the passed in BENCHSET
 ifneq ($(strip ${BENCHSET}),)
-VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
-   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread malloc-simple
+VALIDBENCHSETNAMES := \
+  bench-math \
+  bench-pthread \
+  bench-string \
+  hash-benchset \
+  malloc-simple \
+  malloc-thread \
+  math-benchset \
+  stdio-common-benchset \
+  stdlib-benchset \
+  string-benchset \
+  wcsmbs-benchset \
+# VALIDBENCHSETNAMES
+
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
diff --git a/benchtests/README b/benchtests/README
index 4d83a05b4b..998ba9b2b4 100644
--- a/benchtests/README
+++ b/benchtests/README
@@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
     bench-math
     bench-pthread
     bench-string
+    hash-benchset
+    malloc-thread
+    math-benchset
+    stdio-common-benchset
+    stdlib-benchset
     string-benchset
     wcsmbs-benchset
-    stdlib-benchset
-    stdio-common-benchset
-    math-benchset
-    malloc-thread
 
 Adding a function to benchtests:
 ===============================
diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
new file mode 100644
index 0000000000..067de9fca4
--- /dev/null
+++ b/benchtests/bench-dl-elf-hash.c
@@ -0,0 +1,27 @@
+/* Measure _dl_elf_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-hash.h>
+#include <elf/simple-dl-hash.h>
+#define TEST_FUNC(x, y) _dl_elf_hash (x)
+#define SIMPLE_TEST_FUNC(x, y) __simple_dl_elf_hash (x)
+
+#define TEST_NAME "_dl_elf_hash"
+
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
new file mode 100644
index 0000000000..3c8a1d5a82
--- /dev/null
+++ b/benchtests/bench-dl-new-hash.c
@@ -0,0 +1,25 @@
+/* Measure _dl_new_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf/dl-new-hash.h>
+#define TEST_FUNC(x, y) _dl_new_hash (x)
+#define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
+
+#define TEST_NAME "_dl_new_hash"
+
+#include "bench-hash-funcs.c"
diff --git a/benchtests/bench-hash-funcs-kernel.h b/benchtests/bench-hash-funcs-kernel.h
new file mode 100644
index 0000000000..83995cc0ae
--- /dev/null
+++ b/benchtests/bench-hash-funcs-kernel.h
@@ -0,0 +1,86 @@
+/* Actual benchmark kernels used by bench-hash-funcs.c
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+
+/* We go through the trouble of using macros here because many of the
+   hash functions are meant to be inlined so it's not fair to benchmark
+   them with a function pointer where they won't be inlinable. */
+#undef RUN_FUNC
+#undef POSTFIX
+#ifdef SIMPLE
+# define RUN_FUNC SIMPLE_TEST_FUNC
+# define POSTFIX _simple
+#else
+# define RUN_FUNC TEST_FUNC
+# define POSTFIX _optimized
+#endif
+
+#define PRIMITIVE_CAT(x, y) x ## y
+#define CAT(x, y) PRIMITIVE_CAT (x, y)
+
+static double __attribute__ ((noinline, noclone))
+CAT (do_one_test_kernel, POSTFIX) (const char *s, size_t len)
+{
+
+  unsigned int iters;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (iters = NFIXED_ITERS / 32; iters; --iters)
+    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
+
+  TIMING_NOW (start);
+  for (iters = NFIXED_ITERS; iters; --iters)
+    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
+
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (len);
+  return (double) cur / (double) NFIXED_ITERS;
+}
+
+static double __attribute__ ((noinline, noclone))
+CAT (do_rand_test_kernel, POSTFIX) (char const *bufs,
+				    unsigned int const *sizes)
+{
+  unsigned int i, iters;
+  size_t offset;
+  timing_t start, stop, cur;
+
+  /* Warmup.  */
+  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
+    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
+
+  TIMING_NOW (start);
+  for (iters = NRAND_ITERS; iters; --iters)
+    {
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
+
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  (void) (sizes);
+  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
+}
diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
new file mode 100644
index 0000000000..578c5cbae2
--- /dev/null
+++ b/benchtests/bench-hash-funcs.c
@@ -0,0 +1,145 @@
+/* Measure hash functions runtime.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef TEST_FUNC
+# error "No TEST_FUNC provided!"
+#endif
+#ifndef SIMPLE_TEST_FUNC
+# error "No SIMPLE_TEST_FUNC provided!"
+#endif
+
+#ifndef TEST_NAME
+# define STRINGIFY_PRIMITIVE(x) #  x
+# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
+
+# define TEST_NAME STRINGIFY (TEST_FUNC)
+#endif
+
+#include "json-lib.h"
+#include "bench-timing.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
+
+enum
+{
+  NFIXED_ITERS = 1048576,
+  NRAND_BUFS = 16384,
+  NRAND_ITERS = 2048,
+  RAND_BENCH_MAX_LEN = 128
+};
+
+#include "bench-hash-funcs-kernel.h"
+#define SIMPLE
+#include "bench-hash-funcs-kernel.h"
+
+static void
+do_one_test (json_ctx_t *json_ctx, size_t len)
+{
+  char buf[len + 1];
+  memset (buf, -1, len);
+  buf[len] = '\0';
+
+  json_element_object_begin (json_ctx);
+
+  json_attr_string (json_ctx, "type", "fixed");
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_double (json_ctx, "time_simple", do_one_test_kernel_simple (buf, len));
+  json_attr_double (json_ctx, "time_optimized", do_one_test_kernel_optimized (buf, len));
+
+  json_element_object_end (json_ctx);
+}
+
+static void __attribute__ ((noinline, noclone))
+do_rand_test (json_ctx_t *json_ctx)
+{
+  size_t i, sz, offset;
+  char *bufs;
+  unsigned int *sizes;
+
+  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
+  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
+  if (bufs == NULL || sizes == NULL)
+    {
+      fprintf (stderr, "Failed to allocate bufs for random test\n");
+      goto done;
+    }
+
+  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_string (json_ctx, "type", "random");
+      json_attr_uint (json_ctx, "length", sz);
+
+      for (i = 0, offset = 0; i < NRAND_BUFS;
+	   ++i, offset += RAND_BENCH_MAX_LEN)
+	{
+	  sizes[i] = random () % sz;
+	  memset (bufs + offset, -1, sizes[i]);
+	  bufs[offset + sizes[i]] = '\0';
+	}
+
+      json_attr_double (json_ctx, "time_simple",
+			do_rand_test_kernel_simple (bufs, sizes));
+      json_attr_double (json_ctx, "time_optimized",
+			do_rand_test_kernel_optimized (bufs, sizes));
+      json_element_object_end (json_ctx);
+    }
+
+done:
+  if (bufs)
+    free (bufs);
+
+  if (sizes)
+    free (sizes);
+}
+
+static int
+do_test (void)
+{
+  int i;
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_array_begin (&json_ctx, "results");
+
+  for (i = 0; i < 16; ++i)
+    do_one_test (&json_ctx, i);
+
+  for (i = 16; i <= 256; i += i)
+    do_one_test (&json_ctx, i);
+
+  do_rand_test (&json_ctx);
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
new file mode 100644
index 0000000000..7e369428a2
--- /dev/null
+++ b/benchtests/bench-nss-hash.c
@@ -0,0 +1,26 @@
+/* Measure __nss_hash runtime
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#include <nss/simple-nss-hash.h>
+#define TEST_FUNC __nss_hash
+#define SIMPLE_TEST_FUNC __simple_nss_hash
+
+uint32_t __nss_hash (const void *__key, size_t __length);
+
+#include "bench-hash-funcs.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-05-19 22:18   ` [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-19 22:18   ` Noah Goldstein
  2022-05-23  7:44     ` Siddhesh Poyarekar
  2022-05-19 22:18   ` [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
  2022-05-19 22:18   ` [PATCH v11 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:18 UTC (permalink / raw)
  To: libc-alpha

The prior unrolling didn't really do much as it left the dependency
chain between iterations. Unroll the loop by 4 so that four multiplies
can be pipelined on out-of-order machines.

Results for __nss_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric mean of all benchmarks New / Old: 0.845
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    4.019,    3.729,               1.078
 fixed,      1,     4.95,    5.707,               0.867
 fixed,      2,    5.152,    5.657,               0.911
 fixed,      3,    4.641,    5.721,               0.811
 fixed,      4,    5.551,     5.81,               0.955
 fixed,      5,    6.525,    6.552,               0.996
 fixed,      6,    6.711,    6.561,               1.023
 fixed,      7,    6.715,    6.767,               0.992
 fixed,      8,    7.874,    7.915,               0.995
 fixed,      9,    8.888,    9.767,                0.91
 fixed,     10,    8.959,    9.762,               0.918
 fixed,     11,    9.188,    9.987,                0.92
 fixed,     12,    9.708,   10.618,               0.914
 fixed,     13,   10.393,    11.14,               0.933
 fixed,     14,   10.628,   12.097,               0.879
 fixed,     15,   10.982,   12.965,               0.847
 fixed,     16,   11.851,   14.429,               0.821
 fixed,     32,   24.334,   34.414,               0.707
 fixed,     64,   55.618,   86.688,               0.642
 fixed,    128,  118.261,   224.36,               0.527
 fixed,    256,  256.183,  538.629,               0.476
random,      2,   11.194,   11.556,               0.969
random,      4,   17.516,   17.205,               1.018
random,      8,   23.501,   20.985,                1.12
random,     16,   28.131,   29.212,               0.963
random,     32,   35.436,   38.662,               0.917
random,     64,    45.74,   58.868,               0.777
random,    128,   75.394,  121.963,               0.618
random,    256,  139.524,  260.726,               0.535
---
 nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/nss/nss_hash.c b/nss/nss_hash.c
index 3d8e4cf37e..1d3787e675 100644
--- a/nss/nss_hash.c
+++ b/nss/nss_hash.c
@@ -19,58 +19,63 @@
 
 /* This is from libc/db/hash/hash_func.c, hash3 is static there */
 /*
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
  * units.  On the first time through the loop we get the "leftover bytes"
- * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
+ * (len % 4).  On every other iteration, we perform a 4x unrolled version
+ * of HASHC.  Further unrolling does not appear to help.
  *
  * OZ's original sdbm hash
  */
 uint32_t
 __nss_hash (const void *keyarg, size_t len)
 {
+  enum
+  {
+    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
+    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
+    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
+    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
+    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
+  };
+
   const unsigned char *key;
-  size_t loop;
   uint32_t h;
 
-#define HASHC   h = *key++ + 65599 * h
+#define HASHC	h = *key++ + HASH_CONST_P1 * h
 
   h = 0;
   key = keyarg;
   if (len > 0)
     {
-      loop = (len + 8 - 1) >> 3;
-      switch (len & (8 - 1))
-        {
-        case 0:
-          do
-            {
-              HASHC;
-              /* FALLTHROUGH */
-            case 7:
-              HASHC;
-              /* FALLTHROUGH */
-            case 6:
-              HASHC;
-              /* FALLTHROUGH */
-            case 5:
-              HASHC;
-              /* FALLTHROUGH */
-            case 4:
-              HASHC;
-              /* FALLTHROUGH */
-            case 3:
-              HASHC;
-              /* FALLTHROUGH */
-            case 2:
-              HASHC;
-              /* FALLTHROUGH */
-            case 1:
-              HASHC;
-            }
-	  while (--loop);
-        }
+      switch ((len & (4 - 1)))
+	{
+	case 0:
+	  /* h starts out as zero so no need to include the multiply. */
+	  h = *key++;
+	  /* FALLTHROUGH */
+	case 3:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 2:
+	  HASHC;
+	  /* FALLTHROUGH */
+	case 1:
+	  HASHC;
+	  /* FALLTHROUGH */
+	}
+
+      uint32_t c0, c1, c2, c3;
+      for (--len; len >= 4; len -= 4)
+	{
+	  c0 = (unsigned char) *(key + 0);
+	  c1 = (unsigned char) *(key + 1);
+	  c2 = (unsigned char) *(key + 2);
+	  c3 = (unsigned char) *(key + 3);
+	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
+	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
+
+	  key += 4;
+	}
     }
   return h;
 }
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-05-19 22:18   ` [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-19 22:18   ` Noah Goldstein
  2022-05-23  7:46     ` Siddhesh Poyarekar
  2022-05-19 22:18   ` [PATCH v11 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
  5 siblings, 1 reply; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:18 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos, Alexander Monakov

Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. The unrolling allows for
pipelined multiplies.

As well, as an optional sysdep, reorder the operations and prevent
reassociation for better scheduling and higher ILP. This commit
only adds the barrier for x86, although it should be either no
change or a win for any architecture.

Unrolling further started to induce slowdowns for sizes [0, 4],
but it can help the loop, so if larger sizes are the target, further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean of all benchmarks New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 benchtests/bench-dl-new-hash.c              |   3 +-
 elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
 elf/tst-dl-hash.c                           |   1 +
 sysdeps/generic/dl-new-hash.h               | 109 ++++++++++++++++++++
 sysdeps/x86/dl-new-hash.h                   |  24 +++++
 5 files changed, 144 insertions(+), 13 deletions(-)
 rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
 create mode 100644 sysdeps/generic/dl-new-hash.h
 create mode 100644 sysdeps/x86/dl-new-hash.h

diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
index 3c8a1d5a82..040fa7ce01 100644
--- a/benchtests/bench-dl-new-hash.c
+++ b/benchtests/bench-dl-new-hash.c
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <elf/dl-new-hash.h>
+#include <dl-new-hash.h>
+#include <elf/simple-dl-new-hash.h>
 #define TEST_FUNC(x, y) _dl_new_hash (x)
 #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
 
diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
similarity index 75%
rename from elf/dl-new-hash.h
rename to elf/simple-dl-new-hash.h
index 8641bb4196..1437b1bd36 100644
--- a/elf/dl-new-hash.h
+++ b/elf/simple-dl-new-hash.h
@@ -1,4 +1,4 @@
-/* _dl_new_hash for elf symbol lookup
+/* __simple_dl_new_hash for testing true elf symbol lookup.
    Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,16 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#ifndef _DL_NEW_HASH_H
-#define _DL_NEW_HASH_H 1
+#ifndef _SIMPLE_DL_NEW_HASH_H
+#define _SIMPLE_DL_NEW_HASH_H 1
 
 #include <stdint.h>
-/* For __always_inline.  */
-#include <sys/cdefs.h>
 
-static __always_inline uint32_t
+/* For testing/benchmarking purposes.  Real implementation in
+   sysdeps/generic/dl-new-hash.h.  */
+static uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+__simple_dl_new_hash (const char *s)
 {
   uint32_t h = 5381;
   for (unsigned char c = *s; c != '\0'; c = *++s)
@@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
   return h;
 }
 
-/* For testing/benchmarking purposes.  */
-#define __simple_dl_new_hash _dl_new_hash
-
-
-#endif /* dl-new-hash.h */
+#endif /* simple-dl-new-hash.h */
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
index 8697eb73a0..b21766c63d 100644
--- a/elf/tst-dl-hash.c
+++ b/elf/tst-dl-hash.c
@@ -18,6 +18,7 @@
 
 
 #include <simple-dl-hash.h>
+#include <simple-dl-new-hash.h>
 #include <dl-hash.h>
 #include <dl-new-hash.h>
 #include <support/support.h>
diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
new file mode 100644
index 0000000000..59bfb0e1de
--- /dev/null
+++ b/sysdeps/generic/dl-new-hash.h
@@ -0,0 +1,109 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline and __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+   _dl_new_hash (const char *s)
+   {
+      uint32_t h = 5381;
+      for (unsigned char c = *s; c != '\0'; c = *++s)
+        h = h * 33 + c;
+      return h;
+   }
+
+   We can get better performance by slightly unrolling the loop to
+   pipeline the multiplies, which gcc cannot easily do due to
+   dependencies across iterations.
+
+   As well, as an architecture specific option we add asm statements
+   to explicitly specify order of operations and prevent reassociation
+   of instructions that lengthens the loop carried dependency. This
+   may have no effect as the compiler may have ordered instructions
+   the same way without it but in testing this has not been the case
+   for GCC.  Getting GCC to reliably schedule instructions ideally
+   is not easily done.
+
+   Architecture(s) that use the reassociation barriers are:
+   x86
+
+   Note it is very unlikely the reassociation barriers would
+   de-optimize performance on any architecture and with an imperfect
+   compiler it may help performance, especially on out-of-order cpus,
+   so it is suggested that the respective maintainers add them.
+
+   Architecture maintainers are encouraged to benchmark this with
+   __asm_reassociation_barrier defined to __asm__ like it is in x86.
+*/
+
+
+#ifndef __asm_reassociation_barrier
+# define __asm_reassociation_barrier(...)
+#endif
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *str)
+{
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Since hashed string is normally not empty, this is unlikely on the
+	 first iteration of the loop.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  /* Ideal computational order is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;  */
+	  c0 += h;
+	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal computational order is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;  */
+      c1 += c0;
+      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      __asm_reassociation_barrier("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
+}
+
+#endif /* dl-new-hash.h */
diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
new file mode 100644
index 0000000000..ce8fb5a838
--- /dev/null
+++ b/sysdeps/x86/dl-new-hash.h
@@ -0,0 +1,24 @@
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __asm_reassociation_barrier
+# error "__asm_reassociation_barrier should never already be defined."
+#endif
+
+#define __asm_reassociation_barrier __asm__
+#include <sysdeps/generic/dl-new-hash.h>
-- 
2.34.1


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked
  2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-05-19 22:18   ` [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-19 22:18   ` Noah Goldstein
  5 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:18 UTC (permalink / raw)
  To: GNU C Library; +Cc: H.J. Lu, Carlos O'Donell, Siddhesh Poyarekar

On Thu, May 19, 2022 at 5:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No change to the code other than moving the function to
> dl-new-hash.h. Changed name so its now in the reserved namespace.
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

No changes v10 -> v11 so left reviewed by.
> ---
>  elf/dl-lookup.c   | 13 ++-----------
>  elf/dl-new-hash.h | 40 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 42 insertions(+), 11 deletions(-)
>  create mode 100644 elf/dl-new-hash.h
>
> diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
> index 989b073e4f..a42f6d5390 100644
> --- a/elf/dl-lookup.c
> +++ b/elf/dl-lookup.c
> @@ -24,6 +24,7 @@
>  #include <ldsodefs.h>
>  #include <dl-hash.h>
>  #include <dl-machine.h>
> +#include <dl-new-hash.h>
>  #include <dl-protected.h>
>  #include <sysdep-cancel.h>
>  #include <libc-lock.h>
> @@ -558,16 +559,6 @@ skip:
>  }
>
>
> -static uint32_t
> -dl_new_hash (const char *s)
> -{
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> -}
> -
> -
>  /* Add extra dependency on MAP to UNDEF_MAP.  */
>  static int
>  add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
> @@ -816,7 +807,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map,
>                      const struct r_found_version *version,
>                      int type_class, int flags, struct link_map *skip_map)
>  {
> -  const unsigned int new_hash = dl_new_hash (undef_name);
> +  const unsigned int new_hash = _dl_new_hash (undef_name);
>    unsigned long int old_hash = 0xffffffff;
>    struct sym_val current_value = { NULL, NULL };
>    struct r_scope_elem **scope = symbol_scope;
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> new file mode 100644
> index 0000000000..8641bb4196
> --- /dev/null
> +++ b/elf/dl-new-hash.h
> @@ -0,0 +1,40 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *s)
> +{
> +  uint32_t h = 5381;
> +  for (unsigned char c = *s; c != '\0'; c = *++s)
> +    h = h * 33 + c;
> +  return h;
> +}
> +
> +/* For testing/benchmarking purposes.  */
> +#define __simple_dl_new_hash _dl_new_hash
> +
> +
> +#endif /* dl-new-hash.h */
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash)
  2022-05-19 22:17   ` [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
@ 2022-05-19 22:19     ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:19 UTC (permalink / raw)
  To: GNU C Library; +Cc: H.J. Lu, Carlos O'Donell, Siddhesh Poyarekar

On Thu, May 19, 2022 at 5:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> If we want to further optimize the functions, tests are needed.
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>


No changes from v10 to v11, so I kept the Reviewed-by tag.
> ---
>  elf/Makefile         |   1 +
>  elf/simple-dl-hash.h |  42 ++++++++++++++++
>  elf/tst-dl-hash.c    | 115 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 158 insertions(+)
>  create mode 100644 elf/simple-dl-hash.h
>  create mode 100644 elf/tst-dl-hash.c
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 6f4ea78007..838fe39afb 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -312,6 +312,7 @@ tests := \
>    tst-array4 \
>    tst-array5 \
>    tst-auxv \
> +  tst-dl-hash \
>    tst-leaks1 \
>    tst-stringtable \
>    tst-tls9 \
> diff --git a/elf/simple-dl-hash.h b/elf/simple-dl-hash.h
> new file mode 100644
> index 0000000000..53702b3c55
> --- /dev/null
> +++ b/elf/simple-dl-hash.h
> @@ -0,0 +1,42 @@
> +/* __simple_dl_elf_hash for testing true elf symbol lookup.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SIMPLE_DL_ELF_HASH_H
> +#define _SIMPLE_DL_ELF_HASH_H 1
> +
> +#include <stdint.h>
> +
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-hash.h.  */
> +static uint32_t
> +__attribute__ ((unused))
> +__simple_dl_elf_hash (const char *name_arg)
> +{
> +  unsigned long int hash = 0;
> +  for (unsigned char c = *name_arg; c != '\0'; c = *(++name_arg))
> +    {
> +      unsigned long int hi;
> +      hash = (hash << 4) + c;
> +      hi = hash & 0xf0000000;
> +      hash ^= hi >> 24;
> +      hash &= 0x0fffffff;
> +    }
> +  return hash;
> +}
> +
> +#endif /* simple-dl-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> new file mode 100644
> index 0000000000..8697eb73a0
> --- /dev/null
> +++ b/elf/tst-dl-hash.c
> @@ -0,0 +1,115 @@
> +/* Test dl-hash functions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <simple-dl-hash.h>
> +#include <dl-hash.h>
> +#include <dl-new-hash.h>
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +typedef unsigned int (*hash_f) (const char *);
> +
> +
> +
> +static int
> +do_fill_test (size_t len, int fill, const char *name, hash_f testf,
> +             hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  char buf[len + 1];
> +  memset (buf, fill, len);
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: fill(%d) %s(%zu), %x != %x\n", fill, name, len, expec,
> +               res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  if (do_fill_test (len, fill, "dl_new_hash", &_dl_new_hash,
> +                   &__simple_dl_new_hash))
> +    return 1;
> +
> +  return do_fill_test (len, fill, "dl_elf_hash", &_dl_elf_hash,
> +                      &__simple_dl_elf_hash);
> +}
> +
> +static int
> +do_rand_test (size_t len, const char *name, hash_f testf, hash_f expecf)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len + 1];
> +  char v;
> +  for (i = 0; i < len; ++i)
> +    {
> +      v = random ();
> +      if (v == 0)
> +       v = 1;
> +
> +      buf[i] = v;
> +    }
> +  buf[len] = '\0';
> +
> +  expec = expecf (buf);
> +  res = testf (buf);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: random %s(%zu), %x != %x\n", name, len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  if (do_rand_test (len, "dl_new_hash", &_dl_new_hash, &__simple_dl_new_hash))
> +    return 1;
> +
> +  return do_rand_test (len, "dl_elf_hash", &_dl_elf_hash, &__simple_dl_elf_hash);
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +       {
> +         if (do_rand_tests (i))
> +           return 1;
> +
> +         if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +             || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +           return 1;
> +       }
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-19 15:40       ` Siddhesh Poyarekar
@ 2022-05-19 22:20         ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:20 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Thu, May 19, 2022 at 10:40 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 19/05/2022 20:39, Siddhesh Poyarekar wrote:
> > On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > >> If we want to further optimize the function, tests are needed.
> >> ---
> >>   nss/Makefile          |  1 +
> >>   nss/nss_hash.c        | 16 +++++++++
> >>   nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
> >>   nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
> >>   4 files changed, 139 insertions(+)
> >>   create mode 100644 nss/simple-nss-hash.h
> >>   create mode 100644 nss/tst-nss-hash.c
> >
> > LGTM.
> >
> > Reviewed-by: Siddhesh Poyarekar
> >
> >>
> >> diff --git a/nss/Makefile b/nss/Makefile
> >> index d8b06b44fb..a978e3927a 100644
> >> --- a/nss/Makefile
> >> +++ b/nss/Makefile
> >> @@ -62,6 +62,7 @@ tests := \
> >>     test-digits-dots \
> >>     test-netdb \
> >>     tst-nss-getpwent \
> >> +  tst-nss-hash \
> >>     tst-nss-test1 \
> >>     tst-nss-test2 \
> >>     tst-nss-test4 \
> >> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> >> index 27a348ea9b..f9e17d068a 100644
> >> --- a/nss/nss_hash.c
> >> +++ b/nss/nss_hash.c
> >> @@ -75,4 +75,20 @@ __nss_hash (const void *keyarg, size_t len)
> >>     return h;
> >>   }
> >> +/* For testing/benchmarking purposes. */
> >> +static uint32_t
> >> +__simple_nss_hash (const void *keyarg, size_t len)
> >> +{
> >> +  const unsigned char *key;
> >> +  size_t i;
> >> +  uint32_t h = 0;
> >> +  key = keyarg;
> >> +
> >> +  for (i = 0; i < len; ++i)
> >> +    h = *key++ + 65599 * h;
> >> +
> >> +  return h;
> >> +}
> >> +
> >> +
>
> It just struck me (while reviewing 5/6) that this is duplicated in
> simple-nss-hash.h below.  Shouldn't it be one or the other?  I know it's
> "fixed" in 5/6 but it would be nice to restructure things so that the
> tree builds at this point of the patchset too.

Fixed in V11. I forgot to fix this in my last rebases :(
>
> >>   libc_hidden_def (__nss_hash)
> >> diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
> >> new file mode 100644
> >> index 0000000000..47708972e7
> >> --- /dev/null
> >> +++ b/nss/simple-nss-hash.h
> >> @@ -0,0 +1,42 @@
> >> +/* __simple_nss_hash for testing nss_hash function
> >> +   Copyright (C) 2022 Free Software Foundation, Inc.
> >> +   This file is part of the GNU C Library.
> >> +
> >> +   The GNU C Library is free software; you can redistribute it and/or
> >> +   modify it under the terms of the GNU Lesser General Public
> >> +   License as published by the Free Software Foundation; either
> >> +   version 2.1 of the License, or (at your option) any later version.
> >> +
> >> +   The GNU C Library is distributed in the hope that it will be useful,
> >> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> +   Lesser General Public License for more details.
> >> +
> >> +   You should have received a copy of the GNU Lesser General Public
> >> +   License along with the GNU C Library; if not, see
> >> +   <https://www.gnu.org/licenses/>.  */
> >> +
> >> +#ifndef _SIMPLE_NSS_HASH_H
> >> +#define _SIMPLE_NSS_HASH_H 1
> >> +
> >> +#include <stdint.h>
> >> +
> >> +/* For testing/benchmarking purposes.  Real implementation in
> >> +   nss/nss_hash.c.  */
> >> +static uint32_t
> >> +__attribute__ ((unused))
> >> +__simple_nss_hash (const void *keyarg, size_t len)
> >> +{
> >> +  const unsigned char *key;
> >> +  size_t i;
> >> +  uint32_t h = 0;
> >> +  key = keyarg;
> >> +
> >> +  for (i = 0; i < len; ++i)
> >> +    h = *key++ + 65599 * h;
> >> +
> >> +  return h;
> >> +}
> >> +
> >> +
> >> +#endif /* simple-nss-hash.h */
> >> diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
> >> new file mode 100644
> >> index 0000000000..5ec1f9b0c5
> >> --- /dev/null
> >> +++ b/nss/tst-nss-hash.c
> >> @@ -0,0 +1,80 @@
> >> +/* Test __nss_hash
> >> +   Copyright (C) 2022 Free Software Foundation, Inc.
> >> +   This file is part of the GNU C Library.
> >> +
> >> +   The GNU C Library is free software; you can redistribute it and/or
> >> +   modify it under the terms of the GNU Lesser General Public
> >> +   License as published by the Free Software Foundation; either
> >> +   version 2.1 of the License, or (at your option) any later version.
> >> +
> >> +   The GNU C Library is distributed in the hope that it will be useful,
> >> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> +   Lesser General Public License for more details.
> >> +
> >> +   You should have received a copy of the GNU Lesser General Public
> >> +   License along with the GNU C Library; if not, see
> >> +   <https://www.gnu.org/licenses/>.  */
> >> +
> >> +#include <support/support.h>
> >> +#include <support/check.h>
> >> +#include <stdio.h>
> >> +#include <string.h>
> >> +#include <stdlib.h>
> >> +#include <nss.h>
> >> +#include <simple-nss-hash.h>
> >> +
> >> +uint32_t __nss_hash (const void *__key, size_t __length);
> >> +
> >> +static int
> >> +do_fill_tests (size_t len, int fill)
> >> +{
> >> +  uint32_t expec, res;
> >> +  char buf[len];
> >> +  memset (buf, fill, len);
> >> +
> >> +  expec = __simple_nss_hash (buf, len);
> >> +  res = __nss_hash (buf, len);
> >> +  if (expec != res)
> >> +    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec,
> >> res);
> >> +
> >> +  return 0;
> >> +}
> >> +
> >> +static int
> >> +do_rand_tests (size_t len)
> >> +{
> >> +  uint32_t expec, res;
> >> +  size_t i;
> >> +  char buf[len];
> >> +  for (i = 0; i < len; ++i)
> >> +    buf[i] = random ();
> >> +
> >> +  expec = __simple_nss_hash (buf, len);
> >> +  res = __nss_hash (buf, len);
> >> +  if (expec != res)
> >> +    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
> >> +
> >> +  return 0;
> >> +}
> >> +
> >> +static int
> >> +do_test (void)
> >> +{
> >> +  size_t i, j;
> >> +  for (i = 0; i < 100; ++i)
> >> +    {
> >> +      for (j = 0; j < 8192; ++j)
> >> +    {
> >> +      if (do_rand_tests (i))
> >> +        return 1;
> >> +
> >> +      if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> >> +          || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> >> +        return 1;
> >> +    }
> >> +    }
> >> +  return 0;
> >> +}
> >> +
> >> +#include <support/test-driver.c>
> >
>

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-19 15:34     ` Siddhesh Poyarekar
@ 2022-05-19 22:20       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:20 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Thu, May 19, 2022 at 10:34 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > Benchtests are for throughput and include random / fixed size
> > benchmarks.
> > ---
> >   benchtests/Makefile                  |  25 ++++-
> >   benchtests/README                    |   9 +-
> >   benchtests/bench-dl-elf-hash.c       |  27 +++++
> >   benchtests/bench-dl-new-hash.c       |  25 +++++
> >   benchtests/bench-hash-funcs-kernel.h |  92 ++++++++++++++++
> >   benchtests/bench-hash-funcs.c        | 152 +++++++++++++++++++++++++++
> >   benchtests/bench-nss-hash.c          |  26 +++++
> >   7 files changed, 348 insertions(+), 8 deletions(-)
> >   create mode 100644 benchtests/bench-dl-elf-hash.c
> >   create mode 100644 benchtests/bench-dl-new-hash.c
> >   create mode 100644 benchtests/bench-hash-funcs-kernel.h
> >   create mode 100644 benchtests/bench-hash-funcs.c
> >   create mode 100644 benchtests/bench-nss-hash.c
> >
> > diff --git a/benchtests/Makefile b/benchtests/Makefile
> > index de9de5cf58..c279041e19 100644
> > --- a/benchtests/Makefile
> > +++ b/benchtests/Makefile
> > @@ -227,6 +227,12 @@ LOCALES := \
> >   include ../gen-locales.mk
> >   endif
> >
> > +hash-benchset := \
> > +  dl-elf-hash \
> > +  dl-new-hash \
> > +  nss-hash \
> > +# hash-benchset
> > +
> >   stdlib-benchset := strtod
> >
> >   stdio-common-benchset := sprintf
> > @@ -235,7 +241,7 @@ math-benchset := math-inlines
> >
> >   ifeq (${BENCHSET},)
> >   benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> > -         $(math-benchset)
> > +         $(math-benchset) $(hash-benchset)
> >   else
> >   benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
> >   endif
> > @@ -363,9 +369,20 @@ bench-clean:
> >
> >   # Validate the passed in BENCHSET
> >   ifneq ($(strip ${BENCHSET}),)
> > -VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
> > -   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
> > -   malloc-thread malloc-simple
> > +VALIDBENCHSETNAMES := \
> > +  bench-math \
> > +  bench-pthread \
> > +  bench-string \
> > +  hash-benchset \
> > +  malloc-simple \
> > +  malloc-thread \
> > +  math-benchset \
> > +  stdio-common-benchset \
> > +  stdlib-benchset \
> > +  string-benchset \
> > +  wcsmbs-benchset \
> > +# VALIDBENCHSETNAMES
> > +
> >   INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
> >   ifneq (${INVALIDBENCHSETNAMES},)
> >   $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
> > diff --git a/benchtests/README b/benchtests/README
> > index 4d83a05b4b..998ba9b2b4 100644
> > --- a/benchtests/README
> > +++ b/benchtests/README
> > @@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
> >       bench-math
> >       bench-pthread
> >       bench-string
> > +    hash-benchset
> > +    malloc-thread
> > +    math-benchset
> > +    stdio-common-benchset
> > +    stdlib-benchset
> >       string-benchset
> >       wcsmbs-benchset
> > -    stdlib-benchset
> > -    stdio-common-benchset
> > -    math-benchset
> > -    malloc-thread
> >
> >   Adding a function to benchtests:
> >   ===============================
> > diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
> > new file mode 100644
> > index 0000000000..067de9fca4
> > --- /dev/null
> > +++ b/benchtests/bench-dl-elf-hash.c
> > @@ -0,0 +1,27 @@
> > > +/* Measure _dl_elf_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <dl-hash.h>
> > +#include <elf/simple-dl-hash.h>
> > +#define TEST_FUNC(x, y) _dl_elf_hash (x)
> > +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_elf_hash (x)
> > +
> > +#define TEST_NAME "_dl_elf_hash"
> > +
> > +
> > +#include "bench-hash-funcs.c"
>
> OK.
>
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > new file mode 100644
> > index 0000000000..3c8a1d5a82
> > --- /dev/null
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -0,0 +1,25 @@
> > > +/* Measure _dl_new_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <elf/dl-new-hash.h>
> > +#define TEST_FUNC(x, y) _dl_new_hash (x)
> > +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
> > +
> > +#define TEST_NAME "_dl_new_hash"
> > +
> > +#include "bench-hash-funcs.c"
>
> OK.
>
> > diff --git a/benchtests/bench-hash-funcs-kernel.h b/benchtests/bench-hash-funcs-kernel.h
> > new file mode 100644
> > index 0000000000..9f9f245641
> > --- /dev/null
> > +++ b/benchtests/bench-hash-funcs-kernel.h
> > @@ -0,0 +1,92 @@
> > > +/* Actual benchmark kernels used by bench-hash-funcs.c
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +
> > +/* We go through the trouble of using macros here because many of the
> > > +   hash functions are meant to be inlined so it's not fair to benchmark
> > +   them with a function pointer where they won't be inlinable. */
> > +#undef RUN_FUNC
> > +#undef POSTFIX
> > +#ifdef SIMPLE
> > +# define RUN_FUNC SIMPLE_TEST_FUNC
> > +# define POSTFIX _simple
> > +#else
> > +# define RUN_FUNC TEST_FUNC
> > +# define POSTFIX _optimized
> > +#endif
> > +
> > +#define PRIMITIVE_CAT(x, y) x ## y
> > +#define CAT(x, y) PRIMITIVE_CAT (x, y)
> > +
> > +static double __attribute__ ((noinline, noclone))
> > +CAT (do_one_test_kernel, POSTFIX) (const char *s, size_t len)
> > +{
> > +
> > +  unsigned int iters;
> > +  timing_t start, stop, cur;
> > +
> > +  /* Warmup.  */
> > +  for (iters = NFIXED_ITERS / 32; iters; --iters)
> > +    {
>
> Redundant braces here and elsewhere below.

Fixed in V11 here and everywhere else I saw.
>
> > +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> > +    }
> > +
> > +  TIMING_NOW (start);
> > +  for (iters = NFIXED_ITERS; iters; --iters)
> > +    {
> > +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  (void) (len);
> > +  return (double) cur / (double) NFIXED_ITERS;
> > +}
> > +
> > +static double __attribute__ ((noinline, noclone))
> > +CAT (do_rand_test_kernel, POSTFIX) (char const *bufs,
> > +                                 unsigned int const *sizes)
> > +{
> > +  unsigned int i, iters;
> > +  size_t offset;
> > +  timing_t start, stop, cur;
> > +
> > +  /* Warmup.  */
> > +  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
> > +    {
> > +      DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> > +    }
> > +
> > +  TIMING_NOW (start);
> > +  for (iters = NRAND_ITERS; iters; --iters)
> > +    {
> > +      for (i = 0, offset = 0; i < NRAND_BUFS;
> > +        ++i, offset += RAND_BENCH_MAX_LEN)
> > +     {
> > +       DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> > +     }
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  (void) (sizes);
> > +  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
> > +}
> > diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
> > new file mode 100644
> > index 0000000000..3d3c736ffc
> > --- /dev/null
> > +++ b/benchtests/bench-hash-funcs.c
> > @@ -0,0 +1,152 @@
> > +/* Measure hash functions runtime.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#define TEST_MAIN
> > +#ifndef TEST_FUNC
> > +# error "No TEST_FUNC provided!"
> > +#endif
> > +#ifndef SIMPLE_TEST_FUNC
> > +# error "No SIMPLE_TEST_FUNC provided!"
> > +#endif
> > +
> > +#ifndef TEST_NAME
> > +# define STRINGIFY_PRIMITIVE(x) #  x
> > +# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
> > +
> > +# define TEST_NAME STRINGIFY (TEST_FUNC)
> > +#endif
> > +
> > +#include "json-lib.h"
> > +#include "bench-timing.h"
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +
> > +#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
> > +
> > +enum
> > +{
> > +  NFIXED_ITERS = 1048576,
> > +  NRAND_BUFS = 16384,
> > +  NRAND_ITERS = 2048,
> > +  RAND_BENCH_MAX_LEN = 128
> > +};
> > +
> > +#include "bench-hash-funcs-kernel.h"
> > +#define SIMPLE
> > +#include "bench-hash-funcs-kernel.h"
> > +
> > +static void
> > +do_one_test (json_ctx_t *json_ctx, size_t len)
> > +{
> > +  char buf[len + 1];
> > +  memset (buf, -1, len);
> > +  buf[len] = '\0';
> > +
> > +  json_element_object_begin (json_ctx);
> > +
> > +  json_attr_string (json_ctx, "type", "fixed");
> > +  json_attr_uint (json_ctx, "length", len);
> > +  json_attr_double (json_ctx, "time_simple", do_one_test_kernel_simple (buf, len));
> > +  json_attr_double (json_ctx, "time_optimized", do_one_test_kernel_optimized (buf, len));
> > +
> > +  json_element_object_end (json_ctx);
> > +}
> > +
> > +static void __attribute__ ((noinline, noclone))
> > +do_rand_test (json_ctx_t *json_ctx)
> > +{
> > +  size_t i, sz, offset;
> > +  char *bufs;
> > +  unsigned int *sizes;
> > +
> > +  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
> > +  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
> > +  if (bufs == NULL || sizes == NULL)
> > +    {
> > +      fprintf (stderr, "Failed to allocate bufs for random test\n");
> > +      goto done;
> > +    }
> > +
> > +  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
> > +    {
> > +      json_element_object_begin (json_ctx);
> > +      json_attr_string (json_ctx, "type", "random");
> > +      json_attr_uint (json_ctx, "length", sz);
> > +
> > +      for (i = 0, offset = 0; i < NRAND_BUFS;
> > +        ++i, offset += RAND_BENCH_MAX_LEN)
> > +     {
> > +       sizes[i] = random () % sz;
> > +       memset (bufs + offset, -1, sizes[i]);
> > +       bufs[offset + sizes[i]] = '\0';
> > +     }
> > +
> > +      json_attr_double (json_ctx, "time_simple",
> > +                     do_rand_test_kernel_simple (bufs, sizes));
> > +      json_attr_double (json_ctx, "time_optimized",
> > +                     do_rand_test_kernel_optimized (bufs, sizes));
> > +      json_element_object_end (json_ctx);
> > +    }
> > +
> > +done:
> > +  if (bufs)
> > +    {
>
> Redundant braces here and multiple places in this file.

Fixed in V11 here and everywhere else I saw.
>
> > +      free (bufs);
> > +    }
> > +  if (sizes)
> > +    {
> > +      free (sizes);
> > +    }
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  int i;
> > +  json_ctx_t json_ctx;
> > +
> > +  json_init (&json_ctx, 0, stdout);
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_array_begin (&json_ctx, "results");
> > +
> > +  for (i = 0; i < 16; ++i)
> > +    {
> > +      do_one_test (&json_ctx, i);
> > +    }
> > +
> > +  for (i = 16; i <= 256; i += i)
> > +    {
> > +      do_one_test (&json_ctx, i);
> > +    }
> > +
> > +  do_rand_test (&json_ctx);
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> > +  return 0;
> > +}
> > +
> > +#include <support/test-driver.c>
> > diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
> > new file mode 100644
> > index 0000000000..7e369428a2
> > --- /dev/null
> > +++ b/benchtests/bench-nss-hash.c
> > @@ -0,0 +1,26 @@
> > +/* Measure __nss_hash runtime
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nss.h>
> > +#include <nss/simple-nss-hash.h>
> > +#define TEST_FUNC __nss_hash
> > +#define SIMPLE_TEST_FUNC __simple_nss_hash
> > +
> > +uint32_t __nss_hash (const void *__key, size_t __length);
> > +
> > +#include "bench-hash-funcs.c"
>
> OK.
>
> So just the redundant braces to fix and you should be OK.  Please send
> V11 with the change.
>
> Thanks,
> Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-19 15:41     ` Siddhesh Poyarekar
@ 2022-05-19 22:21       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:21 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library

On Thu, May 19, 2022 at 10:41 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > The prior unrolling didn't really do much as it left the dependency
> > chain between iterations. Unrolled the loop for 4 so 4x multiplies
> > could be pipelined in out-of-order machines.
> >
> > Results for __nss_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=25 runs
> > Geometric mean of all benchmarks, New / Old: 0.845
> >    type, length, New Time, Old Time, New Time / Old Time
> >   fixed,      0,    4.019,    3.729,               1.078
> >   fixed,      1,     4.95,    5.707,               0.867
> >   fixed,      2,    5.152,    5.657,               0.911
> >   fixed,      3,    4.641,    5.721,               0.811
> >   fixed,      4,    5.551,     5.81,               0.955
> >   fixed,      5,    6.525,    6.552,               0.996
> >   fixed,      6,    6.711,    6.561,               1.023
> >   fixed,      7,    6.715,    6.767,               0.992
> >   fixed,      8,    7.874,    7.915,               0.995
> >   fixed,      9,    8.888,    9.767,                0.91
> >   fixed,     10,    8.959,    9.762,               0.918
> >   fixed,     11,    9.188,    9.987,                0.92
> >   fixed,     12,    9.708,   10.618,               0.914
> >   fixed,     13,   10.393,    11.14,               0.933
> >   fixed,     14,   10.628,   12.097,               0.879
> >   fixed,     15,   10.982,   12.965,               0.847
> >   fixed,     16,   11.851,   14.429,               0.821
> >   fixed,     32,   24.334,   34.414,               0.707
> >   fixed,     64,   55.618,   86.688,               0.642
> >   fixed,    128,  118.261,   224.36,               0.527
> >   fixed,    256,  256.183,  538.629,               0.476
> > random,      2,   11.194,   11.556,               0.969
> > random,      4,   17.516,   17.205,               1.018
> > random,      8,   23.501,   20.985,                1.12
> > random,     16,   28.131,   29.212,               0.963
> > random,     32,   35.436,   38.662,               0.917
> > random,     64,    45.74,   58.868,               0.777
> > random,    128,   75.394,  121.963,               0.618
> > random,    256,  139.524,  260.726,               0.535
> > ---
> >   nss/nss_hash.c | 92 ++++++++++++++++++++++----------------------------
> >   1 file changed, 41 insertions(+), 51 deletions(-)
> >
> > diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> > index f9e17d068a..1d3787e675 100644
> > --- a/nss/nss_hash.c
> > +++ b/nss/nss_hash.c
> > @@ -19,74 +19,64 @@
> >
> >   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
> >   /*
> > - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> > + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
> >    * units.  On the first time through the loop we get the "leftover bytes"
> > - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> > - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> > - * this routine is heavily used enough, it's worth the ugly coding.
> > + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> > + * of HASHC. Further unrolling does not appear to help.
> >    *
> >    * OZ's original sdbm hash
> >    */
> >   uint32_t
> >   __nss_hash (const void *keyarg, size_t len)
> >   {
> > +  enum
> > +  {
> > +    HASH_CONST_P0 = 1,              /* (uint32_t)(65599 ^ 0).  */
> > +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> > +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> > +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> > +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> > +  };
> > +
> >     const unsigned char *key;
> > -  size_t loop;
> >     uint32_t h;
> >
> > -#define HASHC   h = *key++ + 65599 * h
> > +#define HASHC        h = *key++ + HASH_CONST_P1 * h
> >
> >     h = 0;
> >     key = keyarg;
> >     if (len > 0)
> >       {
> > -      loop = (len + 8 - 1) >> 3;
> > -      switch (len & (8 - 1))
> > -        {
> > -        case 0:
> > -          do
> > -            {
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 7:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 6:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 5:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 4:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 3:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 2:
> > -              HASHC;
> > -              /* FALLTHROUGH */
> > -            case 1:
> > -              HASHC;
> > -            }
> > -       while (--loop);
> > -        }
> > -    }
> > -  return h;
> > -}
> > +      switch ((len & (4 - 1)))
> > +     {
> > +     case 0:
> > +       /* h starts out as zero so no need to include the multiply. */
> > +       h = *key++;
> > +       /* FALLTHROUGH */
> > +     case 3:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     case 2:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     case 1:
> > +       HASHC;
> > +       /* FALLTHROUGH */
> > +     }
> >
> > -/* For testing/benchmarking purposes. */
> > -static uint32_t
> > -__simple_nss_hash (const void *keyarg, size_t len)
> > -{
> > -  const unsigned char *key;
> > -  size_t i;
> > -  uint32_t h = 0;
> > -  key = keyarg;
> > -
> > -  for (i = 0; i < len; ++i)
> > -    h = *key++ + 65599 * h;
> > +      uint32_t c0, c1, c2, c3;
> > +      for (--len; len >= 4; len -= 4)
> > +     {
> > +       c0 = (unsigned char) *(key + 0);
> > +       c1 = (unsigned char) *(key + 1);
> > +       c2 = (unsigned char) *(key + 2);
> > +       c3 = (unsigned char) *(key + 3);
> > +       h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> > +           + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> >
> > +       key += 4;
> > +     }
> > +    }
> >     return h;
> >   }
> >
>
> This bottom bit should get dropped in 3/6 (and not have
> __simple_nss_hash in two places) and then reintroduced here.

Fixed in V11.
>
> Thanks,
> Siddhesh

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-19 15:55     ` Siddhesh Poyarekar
@ 2022-05-19 22:22       ` Noah Goldstein
  0 siblings, 0 replies; 167+ messages in thread
From: Noah Goldstein @ 2022-05-19 22:22 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: GNU C Library, Alexander Monakov

On Thu, May 19, 2022 at 10:55 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. The unrolling allows for
> > pipelined multiplies.
> >
> > As well, as an optional sysdep, reorder the operations and prevent
> > reassociation for better scheduling and higher ILP. This commit
> > only adds the barrier for x86, although it should be either no
> > change or a win for any architecture.
> >
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric mean of all benchmarks, New / Old: 0.674
> >    type, length, New Time, Old Time, New Time / Old Time
> >   fixed,      0,    2.865,     2.72,               1.053
> >   fixed,      1,    3.567,    2.489,               1.433
> >   fixed,      2,    2.577,    3.649,               0.706
> >   fixed,      3,    3.644,    5.983,               0.609
> >   fixed,      4,    4.211,    6.833,               0.616
> >   fixed,      5,    4.741,    9.372,               0.506
> >   fixed,      6,    5.415,    9.561,               0.566
> >   fixed,      7,    6.649,   10.789,               0.616
> >   fixed,      8,    8.081,   11.808,               0.684
> >   fixed,      9,    8.427,   12.935,               0.651
> >   fixed,     10,    8.673,   14.134,               0.614
> >   fixed,     11,    10.69,   15.408,               0.694
> >   fixed,     12,   10.789,   16.982,               0.635
> >   fixed,     13,   12.169,   18.411,               0.661
> >   fixed,     14,   12.659,   19.914,               0.636
> >   fixed,     15,   13.526,   21.541,               0.628
> >   fixed,     16,   14.211,   23.088,               0.616
> >   fixed,     32,   29.412,   52.722,               0.558
> >   fixed,     64,    65.41,  142.351,               0.459
> >   fixed,    128,  138.505,  295.625,               0.469
> >   fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >   benchtests/bench-dl-new-hash.c              |   3 +-
> >   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
> >   elf/tst-dl-hash.c                           |   1 +
> >   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
> >   sysdeps/x86/dl-new-hash.h                   |  24 +++++
> >   5 files changed, 146 insertions(+), 13 deletions(-)
> >   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
> >   create mode 100644 sysdeps/generic/dl-new-hash.h
> >   create mode 100644 sysdeps/x86/dl-new-hash.h
>
> Mostly OK, just minor nits to fix below.
>
> >
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > index 3c8a1d5a82..040fa7ce01 100644
> > --- a/benchtests/bench-dl-new-hash.c
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -16,7 +16,8 @@
> >      License along with the GNU C Library; if not, see
> >      <https://www.gnu.org/licenses/>.  */
> >
> > -#include <elf/dl-new-hash.h>
> > +#include <dl-new-hash.h>
> > +#include <elf/simple-dl-new-hash.h>
> >   #define TEST_FUNC(x, y) _dl_new_hash (x)
> >   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>
> OK.
>
> >
> > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> > similarity index 75%
> > rename from elf/dl-new-hash.h
> > rename to elf/simple-dl-new-hash.h
> > index 8641bb4196..1437b1bd36 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/simple-dl-new-hash.h
> > @@ -1,4 +1,4 @@
> > -/* _dl_new_hash for elf symbol lookup
> > +/* __simple_dl_new_hash for testing true elf symbol lookup.
> >      Copyright (C) 2022 Free Software Foundation, Inc.
> >      This file is part of the GNU C Library.
> >
> > @@ -16,16 +16,16 @@
> >      License along with the GNU C Library; if not, see
> >      <https://www.gnu.org/licenses/>.  */
> >
> > -#ifndef _DL_NEW_HASH_H
> > -#define _DL_NEW_HASH_H 1
> > +#ifndef _SIMPLE_DL_NEW_HASH_H
> > +#define _SIMPLE_DL_NEW_HASH_H 1
> >
> >   #include <stdint.h>
> > -/* For __always_inline.  */
> > -#include <sys/cdefs.h>
> >
> > -static __always_inline uint32_t
> > +/* For testing/benchmarking purposes.  Real implementation in
> > +   sysdeps/generic/dl-new-hash.h.  */
> > +static uint32_t
> >   __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +__simple_dl_new_hash (const char *s)
> >   {
> >     uint32_t h = 5381;
> >     for (unsigned char c = *s; c != '\0'; c = *++s)
> > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
> >     return h;
> >   }
> >
> > -/* For testing/benchmarking purposes.  */
> > -#define __simple_dl_new_hash _dl_new_hash
> > -
> > -
> > -#endif /* dl-new-hash.h */
> > +#endif /* simple-dl-new-hash.h */
> > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> > index 8697eb73a0..b21766c63d 100644
> > --- a/elf/tst-dl-hash.c
> > +++ b/elf/tst-dl-hash.c
> > @@ -18,6 +18,7 @@
> >
> >
> >   #include <simple-dl-hash.h>
> > +#include <simple-dl-new-hash.h>
> >   #include <dl-hash.h>
> >   #include <dl-new-hash.h>
> >   #include <support/support.h>
> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..1faf309c97
> > --- /dev/null
> > +++ b/sysdeps/generic/dl-new-hash.h
> > @@ -0,0 +1,111 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
>
> Duplicate, but you already know this.
>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +   _dl_new_hash (const char *s)
> > +   {
> > +      uint32_t h = 5381;
> > +      for (unsigned char c = *s; c != '\0'; c = *++s)
> > +        h = h * 33 + c;
> > +      return h;
> > +   }
> > +
> > +   We can get better performance by slightly unrolling the loop to
> > +   pipeline the multiples, which gcc cannot easily do due to
> > +   dependencies across iterations.
> > +
> > +   As well, as an architecture specific option we add asm statements
> > +   to explicitly specify order of operations and prevent reassociation
> > +   of instructions that lengthens the loop carried dependency. This
> > +   may have no effect as the compiler may have ordered instructions
> > +   the same way without it but in testing this has not been the case
> > +   for GCC. Improving GCC to reliably schedule instructions ideally
> > +   cannot be easily done.
> > +
> > +   Architecture(s) that use the reassociation barries are:
>
> barriers

Fixed in V11.
>
> > +   x86
> > +
> > +   Note it is very unlikely the reassociation barriers would
> > +   de-optimize performance on any architecture and with an imperfect
> > +   compiler it may help performance, especially on out-of-order cpus,
> > +   so it is suggested that the respective maintainers add them.
> > +
> > +   architecture maintainers are encouraged to benchmark this with
>
> Architecture

Fixed in V11.
>
> > +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> > +*/
> > +
> > +
> > +#ifndef __asm_reassociation_barrier
> > +# define __asm_reassociation_barrier(...)
> > +#endif
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *str)
> > +{
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = s[0];
> > +      /* Since hashed string is normally not empty, this is unlikely on the
> > +      first iteration of the loop.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +     return h;
> > +
> > +      c1 = s[1];
> > +      if (c1 == 0)
> > +     {
> > +       /* Ideal computational order is:
> > +      c0 += h;
> > +      h *= 32;
> > +      h += c0;  */
> > +       c0 += h;
> > +       __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> > +       h = h * 32 + c0;
> > +       return h;
> > +     }
> > +
> > +      /* Ideal computational order is:
> > +      c1 += c0;
> > +      h *= 33 * 33;
> > +      c0 *= 32;
> > +      c1 += c0;
> > +      h  += c1;  */
> > +      c1 += c0;
> > +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      __asm_reassociation_barrier("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> > +}
> > +
>
> OK.
>
> > +#endif /* dl-new-hash.h */
> > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..ce8fb5a838
> > --- /dev/null
> > +++ b/sysdeps/x86/dl-new-hash.h
> > @@ -0,0 +1,24 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifdef __asm_reassociation_barrier
> > +# error "__asm_reassociation_barrier should never already be defined."
> > +#endif
> > +
> > +#define __asm_reassociation_barrier __asm__
> > +#include <sysdeps/generic/dl-new-hash.h>
>
> OK.

^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h
  2022-05-19 22:18   ` [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
@ 2022-05-23  7:42     ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-23  7:42 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 20/05/2022 03:48, Noah Goldstein via Libc-alpha wrote:
> If we want to further optimize the function, tests are needed.
> ---
>   nss/Makefile          |  1 +
>   nss/nss_hash.c        |  1 +
>   nss/simple-nss-hash.h | 42 +++++++++++++++++++++++
>   nss/tst-nss-hash.c    | 80 +++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 124 insertions(+)
>   create mode 100644 nss/simple-nss-hash.h
>   create mode 100644 nss/tst-nss-hash.c


LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/nss/Makefile b/nss/Makefile
> index d8b06b44fb..a978e3927a 100644
> --- a/nss/Makefile
> +++ b/nss/Makefile
> @@ -62,6 +62,7 @@ tests := \
>     test-digits-dots \
>     test-netdb \
>     tst-nss-getpwent \
> +  tst-nss-hash \
>     tst-nss-test1 \
>     tst-nss-test2 \
>     tst-nss-test4 \
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 27a348ea9b..3d8e4cf37e 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -75,4 +75,5 @@ __nss_hash (const void *keyarg, size_t len)
>     return h;
>   }
>   
> +
>   libc_hidden_def (__nss_hash)
> diff --git a/nss/simple-nss-hash.h b/nss/simple-nss-hash.h
> new file mode 100644
> index 0000000000..47708972e7
> --- /dev/null
> +++ b/nss/simple-nss-hash.h
> @@ -0,0 +1,42 @@
> +/* __simple_nss_hash for testing nss_hash function
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SIMPLE_NSS_HASH_H
> +#define _SIMPLE_NSS_HASH_H 1
> +
> +#include <stdint.h>
> +
> +/* For testing/benchmarking purposes.  Real implementation in
> +   nss/nss_hash.c.  */
> +static uint32_t
> +__attribute__ ((unused))
> +__simple_nss_hash (const void *keyarg, size_t len)
> +{
> +  const unsigned char *key;
> +  size_t i;
> +  uint32_t h = 0;
> +  key = keyarg;
> +
> +  for (i = 0; i < len; ++i)
> +    h = *key++ + 65599 * h;
> +
> +  return h;
> +}
> +
> +
> +#endif /* simple-nss-hash.h */
> diff --git a/nss/tst-nss-hash.c b/nss/tst-nss-hash.c
> new file mode 100644
> index 0000000000..5ec1f9b0c5
> --- /dev/null
> +++ b/nss/tst-nss-hash.c
> @@ -0,0 +1,80 @@
> +/* Test __nss_hash
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <support/support.h>
> +#include <support/check.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +#include <nss.h>
> +#include <simple-nss-hash.h>
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +static int
> +do_fill_tests (size_t len, int fill)
> +{
> +  uint32_t expec, res;
> +  char buf[len];
> +  memset (buf, fill, len);
> +
> +  expec = __simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: fill(%d) (%zu), %x != %x\n", fill, len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_rand_tests (size_t len)
> +{
> +  uint32_t expec, res;
> +  size_t i;
> +  char buf[len];
> +  for (i = 0; i < len; ++i)
> +    buf[i] = random ();
> +
> +  expec = __simple_nss_hash (buf, len);
> +  res = __nss_hash (buf, len);
> +  if (expec != res)
> +    FAIL_EXIT1 ("FAIL: random (%zu), %x != %x\n", len, expec, res);
> +
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  size_t i, j;
> +  for (i = 0; i < 100; ++i)
> +    {
> +      for (j = 0; j < 8192; ++j)
> +	{
> +	  if (do_rand_tests (i))
> +	    return 1;
> +
> +	  if (do_fill_tests (i, -1) || do_fill_tests (i, 1)
> +	      || do_fill_tests (i, 0x80) || do_fill_tests (i, 0x88))
> +	    return 1;
> +	}
> +    }
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash
  2022-05-19 22:18   ` [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
@ 2022-05-23  7:44     ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-23  7:44 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 20/05/2022 03:48, Noah Goldstein via Libc-alpha wrote:
> Benchtests are for throughput and include random / fixed size
> benchmarks.
> ---
>   benchtests/Makefile                  |  25 ++++-
>   benchtests/README                    |   9 +-
>   benchtests/bench-dl-elf-hash.c       |  27 +++++
>   benchtests/bench-dl-new-hash.c       |  25 +++++
>   benchtests/bench-hash-funcs-kernel.h |  86 ++++++++++++++++
>   benchtests/bench-hash-funcs.c        | 145 +++++++++++++++++++++++++++
>   benchtests/bench-nss-hash.c          |  26 +++++
>   7 files changed, 335 insertions(+), 8 deletions(-)
>   create mode 100644 benchtests/bench-dl-elf-hash.c
>   create mode 100644 benchtests/bench-dl-new-hash.c
>   create mode 100644 benchtests/bench-hash-funcs-kernel.h
>   create mode 100644 benchtests/bench-hash-funcs.c
>   create mode 100644 benchtests/bench-nss-hash.c

OK.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index de9de5cf58..c279041e19 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -227,6 +227,12 @@ LOCALES := \
>   include ../gen-locales.mk
>   endif
>   
> +hash-benchset := \
> +  dl-elf-hash \
> +  dl-new-hash \
> +  nss-hash \
> +# hash-benchset
> +
>   stdlib-benchset := strtod
>   
>   stdio-common-benchset := sprintf
> @@ -235,7 +241,7 @@ math-benchset := math-inlines
>   
>   ifeq (${BENCHSET},)
>   benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> -	    $(math-benchset)
> +	    $(math-benchset) $(hash-benchset)
>   else
>   benchset := $(foreach B,$(filter %-benchset,${BENCHSET}), ${${B}})
>   endif
> @@ -363,9 +369,20 @@ bench-clean:
>   
>   # Validate the passed in BENCHSET
>   ifneq ($(strip ${BENCHSET}),)
> -VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
> -   wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
> -   malloc-thread malloc-simple
> +VALIDBENCHSETNAMES := \
> +  bench-math \
> +  bench-pthread \
> +  bench-string \
> +  hash-benchset \
> +  malloc-simple \
> +  malloc-thread \
> +  math-benchset \
> +  stdio-common-benchset \
> +  stdlib-benchset \
> +  string-benchset \
> +  wcsmbs-benchset \
> +# VALIDBENCHSETNAMES
> +
>   INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
>   ifneq (${INVALIDBENCHSETNAMES},)
>   $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
> diff --git a/benchtests/README b/benchtests/README
> index 4d83a05b4b..998ba9b2b4 100644
> --- a/benchtests/README
> +++ b/benchtests/README
> @@ -84,12 +84,13 @@ where BENCHSET may be a space-separated list of the following values:
>       bench-math
>       bench-pthread
>       bench-string
> +    hash-benchset
> +    malloc-thread
> +    math-benchset
> +    stdio-common-benchset
> +    stdlib-benchset
>       string-benchset
>       wcsmbs-benchset
> -    stdlib-benchset
> -    stdio-common-benchset
> -    math-benchset
> -    malloc-thread
>   
>   Adding a function to benchtests:
>   ===============================
> diff --git a/benchtests/bench-dl-elf-hash.c b/benchtests/bench-dl-elf-hash.c
> new file mode 100644
> index 0000000000..067de9fca4
> --- /dev/null
> +++ b/benchtests/bench-dl-elf-hash.c
> @@ -0,0 +1,27 @@
> +/* Measure _dl_elf_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <dl-hash.h>
> +#include <elf/simple-dl-hash.h>
> +#define TEST_FUNC(x, y) _dl_elf_hash (x)
> +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_elf_hash (x)
> +
> +#define TEST_NAME "_dl_elf_hash"
> +
> +
> +#include "bench-hash-funcs.c"
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> new file mode 100644
> index 0000000000..3c8a1d5a82
> --- /dev/null
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -0,0 +1,25 @@
> +/* Measure _dl_new_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <elf/dl-new-hash.h>
> +#define TEST_FUNC(x, y) _dl_new_hash (x)
> +#define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
> +
> +#define TEST_NAME "_dl_new_hash"
> +
> +#include "bench-hash-funcs.c"
> diff --git a/benchtests/bench-hash-funcs-kernel.h b/benchtests/bench-hash-funcs-kernel.h
> new file mode 100644
> index 0000000000..83995cc0ae
> --- /dev/null
> +++ b/benchtests/bench-hash-funcs-kernel.h
> @@ -0,0 +1,86 @@
> +/* Actual benchmark kernels used by bench-hash-funcs.c
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +
> +/* We go through the trouble of using macros here because many of the
> +   hash functions are meant to be inlined so it's not fair to benchmark
> +   them with a function pointer where they won't be inlinable. */
> +#undef RUN_FUNC
> +#undef POSTFIX
> +#ifdef SIMPLE
> +# define RUN_FUNC SIMPLE_TEST_FUNC
> +# define POSTFIX _simple
> +#else
> +# define RUN_FUNC TEST_FUNC
> +# define POSTFIX _optimized
> +#endif
> +
> +#define PRIMITIVE_CAT(x, y) x ## y
> +#define CAT(x, y) PRIMITIVE_CAT (x, y)
> +
> +static double __attribute__ ((noinline, noclone))
> +CAT (do_one_test_kernel, POSTFIX) (const char *s, size_t len)
> +{
> +
> +  unsigned int iters;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (iters = NFIXED_ITERS / 32; iters; --iters)
> +    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> +
> +  TIMING_NOW (start);
> +  for (iters = NFIXED_ITERS; iters; --iters)
> +    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (s, len));
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (len);
> +  return (double) cur / (double) NFIXED_ITERS;
> +}
> +
> +static double __attribute__ ((noinline, noclone))
> +CAT (do_rand_test_kernel, POSTFIX) (char const *bufs,
> +				    unsigned int const *sizes)
> +{
> +  unsigned int i, iters;
> +  size_t offset;
> +  timing_t start, stop, cur;
> +
> +  /* Warmup.  */
> +  for (i = 0, offset = 0; i < NRAND_BUFS; ++i, offset += RAND_BENCH_MAX_LEN)
> +    DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> +
> +  TIMING_NOW (start);
> +  for (iters = NRAND_ITERS; iters; --iters)
> +    {
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	DO_NOT_OPTIMIZE_OUT (RUN_FUNC (bufs + offset, sizes[i]));
> +
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  (void) (sizes);
> +  return (double) cur / (double) (NRAND_ITERS * NRAND_BUFS);
> +}
> diff --git a/benchtests/bench-hash-funcs.c b/benchtests/bench-hash-funcs.c
> new file mode 100644
> index 0000000000..578c5cbae2
> --- /dev/null
> +++ b/benchtests/bench-hash-funcs.c
> @@ -0,0 +1,145 @@
> +/* Measure hash functions runtime.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#ifndef TEST_FUNC
> +# error "No TEST_FUNC provided!"
> +#endif
> +#ifndef SIMPLE_TEST_FUNC
> +# error "No SIMPLE_TEST_FUNC provided!"
> +#endif
> +
> +#ifndef TEST_NAME
> +# define STRINGIFY_PRIMITIVE(x) #  x
> +# define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)
> +
> +# define TEST_NAME STRINGIFY (TEST_FUNC)
> +#endif
> +
> +#include "json-lib.h"
> +#include "bench-timing.h"
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#define DO_NOT_OPTIMIZE_OUT(x) __asm__ volatile("" : : "r,m"(x) : "memory")
> +
> +enum
> +{
> +  NFIXED_ITERS = 1048576,
> +  NRAND_BUFS = 16384,
> +  NRAND_ITERS = 2048,
> +  RAND_BENCH_MAX_LEN = 128
> +};
> +
> +#include "bench-hash-funcs-kernel.h"
> +#define SIMPLE
> +#include "bench-hash-funcs-kernel.h"
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, size_t len)
> +{
> +  char buf[len + 1];
> +  memset (buf, -1, len);
> +  buf[len] = '\0';
> +
> +  json_element_object_begin (json_ctx);
> +
> +  json_attr_string (json_ctx, "type", "fixed");
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_double (json_ctx, "time_simple", do_one_test_kernel_simple (buf, len));
> +  json_attr_double (json_ctx, "time_optimized", do_one_test_kernel_optimized (buf, len));
> +
> +  json_element_object_end (json_ctx);
> +}
> +
> +static void __attribute__ ((noinline, noclone))
> +do_rand_test (json_ctx_t *json_ctx)
> +{
> +  size_t i, sz, offset;
> +  char *bufs;
> +  unsigned int *sizes;
> +
> +  bufs = (char *) calloc (NRAND_BUFS, RAND_BENCH_MAX_LEN);
> +  sizes = (unsigned int *) calloc (NRAND_BUFS, sizeof (unsigned int));
> +  if (bufs == NULL || sizes == NULL)
> +    {
> +      fprintf (stderr, "Failed to allocate bufs for random test\n");
> +      goto done;
> +    }
> +
> +  for (sz = 2; sz <= RAND_BENCH_MAX_LEN; sz += sz)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_string (json_ctx, "type", "random");
> +      json_attr_uint (json_ctx, "length", sz);
> +
> +      for (i = 0, offset = 0; i < NRAND_BUFS;
> +	   ++i, offset += RAND_BENCH_MAX_LEN)
> +	{
> +	  sizes[i] = random () % sz;
> +	  memset (bufs + offset, -1, sizes[i]);
> +	  bufs[offset + sizes[i]] = '\0';
> +	}
> +
> +      json_attr_double (json_ctx, "time_simple",
> +			do_rand_test_kernel_simple (bufs, sizes));
> +      json_attr_double (json_ctx, "time_optimized",
> +			do_rand_test_kernel_optimized (bufs, sizes));
> +      json_element_object_end (json_ctx);
> +    }
> +
> +done:
> +  if (bufs)
> +    free (bufs);
> +
> +  if (sizes)
> +    free (sizes);
> +}
> +
> +static int
> +do_test (void)
> +{
> +  int i;
> +  json_ctx_t json_ctx;
> +
> +  json_init (&json_ctx, 0, stdout);
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_array_begin (&json_ctx, "results");
> +
> +  for (i = 0; i < 16; ++i)
> +    do_one_test (&json_ctx, i);
> +
> +  for (i = 16; i <= 256; i += i)
> +    do_one_test (&json_ctx, i);
> +
> +  do_rand_test (&json_ctx);
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/benchtests/bench-nss-hash.c b/benchtests/bench-nss-hash.c
> new file mode 100644
> index 0000000000..7e369428a2
> --- /dev/null
> +++ b/benchtests/bench-nss-hash.c
> @@ -0,0 +1,26 @@
> +/* Measure __nss_hash runtime
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nss.h>
> +#include <nss/simple-nss-hash.h>
> +#define TEST_FUNC __nss_hash
> +#define SIMPLE_TEST_FUNC __simple_nss_hash
> +
> +uint32_t __nss_hash (const void *__key, size_t __length);
> +
> +#include "bench-hash-funcs.c"


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c
  2022-05-19 22:18   ` [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
@ 2022-05-23  7:44     ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-23  7:44 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

On 20/05/2022 03:48, Noah Goldstein via Libc-alpha wrote:
> The prior unrolling didn't really do much as it left the dependency
> chain between iterations. Unrolled the loop for 4 so 4x multiplies
> could be pipelined in out-of-order machines.
> 
> Results for __nss_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=25 runs
> Geometric mean of all benchmarks, New / Old: 0.845
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    4.019,    3.729,               1.078
>   fixed,      1,     4.95,    5.707,               0.867
>   fixed,      2,    5.152,    5.657,               0.911
>   fixed,      3,    4.641,    5.721,               0.811
>   fixed,      4,    5.551,     5.81,               0.955
>   fixed,      5,    6.525,    6.552,               0.996
>   fixed,      6,    6.711,    6.561,               1.023
>   fixed,      7,    6.715,    6.767,               0.992
>   fixed,      8,    7.874,    7.915,               0.995
>   fixed,      9,    8.888,    9.767,                0.91
>   fixed,     10,    8.959,    9.762,               0.918
>   fixed,     11,    9.188,    9.987,                0.92
>   fixed,     12,    9.708,   10.618,               0.914
>   fixed,     13,   10.393,    11.14,               0.933
>   fixed,     14,   10.628,   12.097,               0.879
>   fixed,     15,   10.982,   12.965,               0.847
>   fixed,     16,   11.851,   14.429,               0.821
>   fixed,     32,   24.334,   34.414,               0.707
>   fixed,     64,   55.618,   86.688,               0.642
>   fixed,    128,  118.261,   224.36,               0.527
>   fixed,    256,  256.183,  538.629,               0.476
> random,      2,   11.194,   11.556,               0.969
> random,      4,   17.516,   17.205,               1.018
> random,      8,   23.501,   20.985,                1.12
> random,     16,   28.131,   29.212,               0.963
> random,     32,   35.436,   38.662,               0.917
> random,     64,    45.74,   58.868,               0.777
> random,    128,   75.394,  121.963,               0.618
> random,    256,  139.524,  260.726,               0.535
> ---
>   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
>   1 file changed, 42 insertions(+), 37 deletions(-)

OK.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 3d8e4cf37e..1d3787e675 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -19,58 +19,63 @@
>   
>   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
>   /*
> - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
>    * units.  On the first time through the loop we get the "leftover bytes"
> - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> - * this routine is heavily used enough, it's worth the ugly coding.
> + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> + * of HASHC.  Further unrolling does not appear to help.
>    *
>    * OZ's original sdbm hash
>    */
>   uint32_t
>   __nss_hash (const void *keyarg, size_t len)
>   {
> +  enum
> +  {
> +    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
> +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> +  };
> +
>     const unsigned char *key;
> -  size_t loop;
>     uint32_t h;
>   
> -#define HASHC   h = *key++ + 65599 * h
> +#define HASHC	h = *key++ + HASH_CONST_P1 * h
>   
>     h = 0;
>     key = keyarg;
>     if (len > 0)
>       {
> -      loop = (len + 8 - 1) >> 3;
> -      switch (len & (8 - 1))
> -        {
> -        case 0:
> -          do
> -            {
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 7:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 6:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 5:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 4:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 3:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 2:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 1:
> -              HASHC;
> -            }
> -	  while (--loop);
> -        }
> +      switch ((len & (4 - 1)))
> +	{
> +	case 0:
> +	  /* h starts out as zero so no need to include the multiply. */
> +	  h = *key++;
> +	  /* FALLTHROUGH */
> +	case 3:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 2:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 1:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	}
> +
> +      uint32_t c0, c1, c2, c3;
> +      for (--len; len >= 4; len -= 4)
> +	{
> +	  c0 = (unsigned char) *(key + 0);
> +	  c1 = (unsigned char) *(key + 1);
> +	  c2 = (unsigned char) *(key + 2);
> +	  c3 = (unsigned char) *(key + 3);
> +	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> +	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> +
> +	  key += 4;
> +	}
>       }
>     return h;
>   }


^ permalink raw reply	[flat|nested] 167+ messages in thread

* Re: [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h
  2022-05-19 22:18   ` [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
@ 2022-05-23  7:46     ` Siddhesh Poyarekar
  0 siblings, 0 replies; 167+ messages in thread
From: Siddhesh Poyarekar @ 2022-05-23  7:46 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha; +Cc: Alexander Monakov

On 20/05/2022 03:48, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
> 
> As well, as an optional sysdep, reorder the operations and prevent
> reassociation for better scheduling and higher ILP. This commit
> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
> 
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric mean of all benchmarks, New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   benchtests/bench-dl-new-hash.c              |   3 +-
>   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>   elf/tst-dl-hash.c                           |   1 +
>   sysdeps/generic/dl-new-hash.h               | 109 ++++++++++++++++++++
>   sysdeps/x86/dl-new-hash.h                   |  24 +++++
>   5 files changed, 144 insertions(+), 13 deletions(-)
>   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>   create mode 100644 sysdeps/generic/dl-new-hash.h
>   create mode 100644 sysdeps/x86/dl-new-hash.h

OK.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> index 3c8a1d5a82..040fa7ce01 100644
> --- a/benchtests/bench-dl-new-hash.c
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -16,7 +16,8 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#include <elf/dl-new-hash.h>
> +#include <dl-new-hash.h>
> +#include <elf/simple-dl-new-hash.h>
>   #define TEST_FUNC(x, y) _dl_new_hash (x)
>   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>   
> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> similarity index 75%
> rename from elf/dl-new-hash.h
> rename to elf/simple-dl-new-hash.h
> index 8641bb4196..1437b1bd36 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/simple-dl-new-hash.h
> @@ -1,4 +1,4 @@
> -/* _dl_new_hash for elf symbol lookup
> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>      Copyright (C) 2022 Free Software Foundation, Inc.
>      This file is part of the GNU C Library.
>   
> @@ -16,16 +16,16 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#ifndef _DL_NEW_HASH_H
> -#define _DL_NEW_HASH_H 1
> +#ifndef _SIMPLE_DL_NEW_HASH_H
> +#define _SIMPLE_DL_NEW_HASH_H 1
>   
>   #include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
>   
> -static __always_inline uint32_t
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-new-hash.h.  */
> +static uint32_t
>   __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +__simple_dl_new_hash (const char *s)
>   {
>     uint32_t h = 5381;
>     for (unsigned char c = *s; c != '\0'; c = *++s)
> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>     return h;
>   }
>   
> -/* For testing/benchmarking purposes.  */
> -#define __simple_dl_new_hash _dl_new_hash
> -
> -
> -#endif /* dl-new-hash.h */
> +#endif /* simple-dl-new-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> index 8697eb73a0..b21766c63d 100644
> --- a/elf/tst-dl-hash.c
> +++ b/elf/tst-dl-hash.c
> @@ -18,6 +18,7 @@
>   
>   
>   #include <simple-dl-hash.h>
> +#include <simple-dl-new-hash.h>
>   #include <dl-hash.h>
>   #include <dl-new-hash.h>
>   #include <support/support.h>
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..59bfb0e1de
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,109 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline and __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the loop to
> +   pipeline the multiplies, which gcc cannot easily do due to
> +   dependencies across iterations.
> +
> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specify order of operations and prevent reassociation
> +   of instructions that lengthens the loop carried dependency. This
> +   may have no effect as the compiler may have ordered instructions
> +   the same way without it but in testing this has not been the case
> +   for GCC. Improving GCC to reliably schedule instructions ideally
> +   cannot be easily done.
> +
> +   Architecture(s) that use the reassociation barriers are:
> +   x86
> +
> +   Note it is very unlikely the reassociation barriers would
> +   de-optimize performance on any architecture and with an imperfect
> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.
> +
> +   Architecture maintainers are encouraged to benchmark this with
> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> +*/
> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +	 first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  /* Ideal computational order is:
> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;  */
> +	  c0 += h;
> +	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal computational order is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;  */
> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +
> +#endif /* dl-new-hash.h */
> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> new file mode 100644
> index 0000000000..ce8fb5a838
> --- /dev/null
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -0,0 +1,24 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __asm_reassociation_barrier
> +# error "__asm_reassociation_barrier should never already be defined."
> +#endif
> +
> +#define __asm_reassociation_barrier __asm__
> +#include <sysdeps/generic/dl-new-hash.h>


^ permalink raw reply	[flat|nested] 167+ messages in thread

end of thread, other threads:[~2022-05-23  7:46 UTC | newest]

Thread overview: 167+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-14  4:12 [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-04-14  4:12 ` [PATCH v1 2/6] elf: Add tests for the hash functions in dl-hash.h Noah Goldstein
2022-04-14  4:12 ` [PATCH v1 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-04-14  4:12 ` [PATCH v1 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-04-14  4:12 ` [PATCH v1 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-04-14  4:12 ` [PATCH v1 6/6] elf: Optimize __dl_new_hash in dl-hash.h Noah Goldstein
2022-04-14  4:32 ` [PATCH v1 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked H.J. Lu
2022-04-14 14:56   ` Noah Goldstein
2022-04-14 14:55 ` [PATCH v2 " Noah Goldstein
2022-04-14 14:55   ` [PATCH v2 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-04-25 15:39     ` Florian Weimer
2022-04-25 15:59       ` Noah Goldstein
2022-04-14 14:55   ` [PATCH v2 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-04-25 15:38     ` Florian Weimer
2022-04-25 15:58       ` Noah Goldstein
2022-04-26  8:35         ` Florian Weimer
2022-04-26 21:39           ` Noah Goldstein
2022-04-27 10:48             ` Florian Weimer
2022-04-27 15:02               ` Noah Goldstein
2022-04-14 14:55   ` [PATCH v2 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-04-14 14:55   ` [PATCH v2 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-04-14 14:55   ` [PATCH v2 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-04-25 15:58 ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-04-25 15:58   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-04-25 15:58   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-04-25 15:58   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-04-25 15:58   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-04-25 15:58   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-04-25 16:01   ` [PATCH v3 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Adhemerval Zanella
2022-04-25 16:18     ` Noah Goldstein
2022-04-25 15:59 ` [PATCH v1 " Adhemerval Zanella
2022-04-25 16:16   ` Noah Goldstein
2022-04-25 16:35 ` [PATCH v3 " Noah Goldstein
2022-04-25 16:35   ` [PATCH v3 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-04-25 16:35   ` [PATCH v3 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-04-27 10:39     ` Florian Weimer
2022-04-27 16:24       ` Noah Goldstein
2022-04-25 16:35   ` [PATCH v3 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-04-25 16:36   ` [PATCH v3 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-04-27 10:47     ` Florian Weimer
2022-04-25 16:36   ` [PATCH v3 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-04-27 10:43     ` Florian Weimer
2022-04-27 16:25       ` Noah Goldstein
2022-04-27 15:02     ` Alexander Monakov
     [not found]       ` <CAFUsyfKeocq4VAusvnggq-NR=tOQTjrD0Z6r3CYCTjGQ=tGGSw@mail.gmail.com>
     [not found]         ` <f54f1ec9-fc31-283f-bce9-59fd8bda98ad@ispras.ru>
2022-04-27 16:23           ` Noah Goldstein
2022-04-28 18:03             ` Alexander Monakov
2022-05-04 18:04               ` Alexander Monakov
2022-05-05 11:07                 ` Alexander Monakov
2022-05-05 15:10                   ` Noah Goldstein
2022-05-05 15:26                     ` Alexander Monakov
2022-05-05 18:03                       ` Noah Goldstein
2022-05-05 19:37                         ` Alexander Monakov
2022-05-05 22:51                           ` Noah Goldstein
2022-04-27 16:19 ` [PATCH v4 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-04-27 16:19   ` [PATCH v4 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-04-27 16:19   ` [PATCH v4 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-04-27 16:20   ` [PATCH v4 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-04-27 16:20   ` [PATCH v4 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-04-27 16:20   ` [PATCH v4 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-09 17:17 ` [PATCH v5 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-05-09 17:17   ` [PATCH v5 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-09 17:17   ` [PATCH v5 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-09 17:17   ` [PATCH v5 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-09 17:17   ` [PATCH v5 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-09 17:17   ` [PATCH v5 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-10 11:58     ` Adhemerval Zanella
2022-05-10 15:04 ` [PATCH v6 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-05-10 15:04   ` [PATCH v6 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-10 15:04   ` [PATCH v6 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-10 15:04   ` [PATCH v6 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-10 15:04   ` [PATCH v6 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-10 15:04   ` [PATCH v6 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-10 15:29     ` H.J. Lu
2022-05-10 15:31       ` H.J. Lu
2022-05-10 16:49     ` Alexander Monakov
2022-05-10 17:17       ` Noah Goldstein
2022-05-10 17:40         ` Alexander Monakov
2022-05-10 23:30 ` [PATCH v7 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-05-10 23:30   ` [PATCH v7 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-10 23:30   ` [PATCH v7 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-10 23:30   ` [PATCH v7 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-10 23:30   ` [PATCH v7 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-10 23:30   ` [PATCH v7 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-10 23:46     ` H.J. Lu
2022-05-11  3:07       ` Noah Goldstein
2022-05-11  3:06 ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein
2022-05-11  3:06   ` [PATCH v8 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-11  3:06   ` [PATCH v8 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-11  3:06   ` [PATCH v8 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-11  3:06   ` [PATCH v8 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-11  3:06   ` [PATCH v8 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-16 14:12     ` Siddhesh Poyarekar
2022-05-16 14:31       ` Alexander Monakov
2022-05-16 16:23         ` Siddhesh Poyarekar
2022-05-16 16:38           ` Noah Goldstein
2022-05-16 16:44             ` Siddhesh Poyarekar
2022-05-16 20:32               ` Noah Goldstein
2022-05-16 18:09       ` Alexander Monakov
2022-05-16 18:47         ` Siddhesh Poyarekar
2022-05-16 19:28           ` Alexander Monakov
2022-05-16 19:35             ` Noah Goldstein
2022-05-16 19:41               ` Alexander Monakov
2022-05-16 19:47                 ` Adhemerval Zanella
2022-05-16 20:00                   ` Alexander Monakov
2022-05-16 20:08                     ` Adhemerval Zanella
2022-05-16 20:27                       ` Alexander Monakov
2022-05-16 19:48                 ` Noah Goldstein
2022-05-16 20:33                   ` Alexander Monakov
2022-05-16 21:40                     ` Noah Goldstein
2022-05-17  1:45             ` Siddhesh Poyarekar
2022-05-16 13:56   ` [PATCH v8 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
2022-05-16 20:31     ` Noah Goldstein
2022-05-16 20:29 ` [PATCH v9 " Noah Goldstein
2022-05-16 20:30   ` [PATCH v9 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-17  4:19     ` Siddhesh Poyarekar
2022-05-18 17:29       ` Noah Goldstein
2022-05-16 20:30   ` [PATCH v9 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-17  4:32     ` Siddhesh Poyarekar
2022-05-18 17:30       ` Noah Goldstein
2022-05-16 20:30   ` [PATCH v9 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-17  4:52     ` Siddhesh Poyarekar
2022-05-18 17:33       ` Noah Goldstein
2022-05-16 20:30   ` [PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-17  5:11     ` Siddhesh Poyarekar
2022-05-18 17:34       ` Noah Goldstein
2022-05-18 17:35         ` Noah Goldstein
2022-05-16 20:30   ` [PATCH v9 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-17  5:12     ` Siddhesh Poyarekar
2022-05-18 17:38       ` Noah Goldstein
2022-05-19 15:59         ` Siddhesh Poyarekar
2022-05-19 16:54           ` DJ Delorie
2022-05-17  3:34   ` [PATCH v9 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
2022-05-18 17:28     ` Noah Goldstein
2022-05-18 17:26 ` [PATCH v10 " Noah Goldstein
2022-05-18 17:26   ` [PATCH v10 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-19 14:49     ` Siddhesh Poyarekar
2022-05-18 17:26   ` [PATCH v10 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-19 15:09     ` Siddhesh Poyarekar
2022-05-19 15:40       ` Siddhesh Poyarekar
2022-05-19 22:20         ` Noah Goldstein
2022-05-18 17:26   ` [PATCH v10 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-19 15:34     ` Siddhesh Poyarekar
2022-05-19 22:20       ` Noah Goldstein
2022-05-18 17:26   ` [PATCH v10 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-19 15:41     ` Siddhesh Poyarekar
2022-05-19 22:21       ` Noah Goldstein
2022-05-18 17:26   ` [PATCH v10 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-18 17:32     ` H.J. Lu
2022-05-18 17:39       ` Noah Goldstein
2022-05-19  7:53       ` Siddhesh Poyarekar
2022-05-19 15:55     ` Siddhesh Poyarekar
2022-05-19 22:22       ` Noah Goldstein
2022-05-19 14:47   ` [PATCH v10 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Siddhesh Poyarekar
2022-05-19 14:50     ` Noah Goldstein
2022-05-19 14:56       ` Siddhesh Poyarekar
2022-05-19 22:17 ` [PATCH v11 " Noah Goldstein
2022-05-19 22:17   ` [PATCH v11 2/6] elf: Add tests for the dl hash funcs (_dl_new_hash and _dl_elf_hash) Noah Goldstein
2022-05-19 22:19     ` Noah Goldstein
2022-05-19 22:18   ` [PATCH v11 3/6] nss: Add tests for the nss_hash in nss_hash.h Noah Goldstein
2022-05-23  7:42     ` Siddhesh Poyarekar
2022-05-19 22:18   ` [PATCH v11 4/6] benchtests: Add benchtests for dl_elf_hash, dl_new_hash and nss_hash Noah Goldstein
2022-05-23  7:44     ` Siddhesh Poyarekar
2022-05-19 22:18   ` [PATCH v11 5/6] nss: Optimize nss_hash in nss_hash.c Noah Goldstein
2022-05-23  7:44     ` Siddhesh Poyarekar
2022-05-19 22:18   ` [PATCH v11 6/6] elf: Optimize _dl_new_hash in dl-new-hash.h Noah Goldstein
2022-05-23  7:46     ` Siddhesh Poyarekar
2022-05-19 22:18   ` [PATCH v11 1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked Noah Goldstein

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).