public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c
@ 2021-11-01  5:49 Noah Goldstein
  2021-11-01  5:49 ` [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
                   ` (7 more replies)
  0 siblings, 8 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:49 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to test both dst > src and dst <
src. This is because there is logic in the code based on the relative
position of dst and src.
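
For illustration, a minimal C sketch of the idea (not the test harness
itself; the helper name and buffer setup below are made up):

    /* Run each implementation once with dst above src and once with dst
       below src, so both internal copy directions get exercised.  */
    #include <stddef.h>

    static void
    check_both_directions (void *(*impl) (void *, const void *, size_t),
                           char *lo, char *hi, size_t len)
    {
      impl (hi, lo, len);   /* dst > src */
      impl (lo, hi, len);   /* dst < src */
    }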
---
 string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
 string/test-memmove.c |  75 ++++++++++++++++++-
 2 files changed, 214 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9e965bed3..3b0f3127b7 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #ifndef MEMCPY_RESULT
+# define DO_EXTRA_TESTS
 # define MEMCPY_RESULT(dst, len) dst
 # define MIN_PAGE_SIZE 131072
 # define TEST_MAIN
@@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -212,56 +215,87 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -298,6 +332,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -306,8 +341,88 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+#ifdef DO_EXTRA_TESTS
+  for (i = 0x200000; i <= 0x2000000; i += i)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+
+          do_test1 (0, j, i + 1);
+          do_test1 (4095, j, i + 1);
+          do_test1 (4096 - j, 0, i + 1);
+
+          do_test1 (0, j - 1, i + 1);
+          do_test1 (4095, j - 1, i + 1);
+          do_test1 (4096 - j - 1, 0, i + 1);
+
+          do_test1 (0, j + 1, i + 1);
+          do_test1 (4095, j + 1, i + 1);
+          do_test1 (4096 - j, 1, i + 1);
+
+          do_test1 (0, j, i - 1);
+          do_test1 (4095, j, i - 1);
+          do_test1 (4096 - j, 0, i - 1);
+
+          do_test1 (0, j - 1, i - 1);
+          do_test1 (4095, j - 1, i - 1);
+          do_test1 (4096 - j - 1, 0, i - 1);
+
+          do_test1 (0, j + 1, i - 1);
+          do_test1 (4095, j + 1, i - 1);
+          do_test1 (4096 - j, 1, i - 1);
+        }
+    }
+#endif
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index a0ce8b0334..5c6d1579e3 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, buf, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -395,13 +440,39 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
+  do_test2 (0x200000 - 1);
   do_test2 (0x200000);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 - 1);
+  do_test2 (0x1000000);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
+  do_test2 (0x4000000 + 1);
 
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1



* [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein
@ 2021-11-01  5:49 ` Noah Goldstein
  2021-11-06  2:27   ` H.J. Lu
  2021-11-01  5:49 ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:49 UTC (permalink / raw)
  To: libc-alpha

This commit adds more cases to the common memcpy/memmove
benchmarks. The most significant additions are the half-page
offsets. The current version leaves dst and src nearly page aligned,
which leads to false 4k aliasing on x86_64. This can add noise due to
false dependencies from one run to the next. As well, this seems more
like an edge case than the common case, so it shouldn't be the only
configuration benchmarked.
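
As a rough illustration (my approximation, not code from the
benchmark), dst and src risk 4k aliasing when their page offsets are
close; the 256-byte window below mirrors the mask used later in the
memmove patch of this series:

    /* Approximate predicate for when a load from src can falsely match a
       recent store to dst on x86_64 (4k aliasing): the two addresses
       nearly agree in their low 12 bits.  */
    #include <stdint.h>
    #include <stdbool.h>

    static bool
    may_4k_alias (const void *dst, const void *src)
    {
      uintptr_t delta = (uintptr_t) dst - (uintptr_t) src;
      return (delta & (4096 - 256)) == 0;   /* dst - src within 256 bytes mod 4 KiB */
    }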
---
 benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
 benchtests/bench-memmove.c | 26 +++++++++++++++++---
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..744bea26d3 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +102,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +124,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,16 +141,26 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
     {
       if ((i & (i - 1)) == 0)
-	continue;
+        continue;
       do_test (&json_ctx, 0, 0, 16 * i, 1);
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +169,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..855f4d0649 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +88,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +142,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1



* [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein
  2021-11-01  5:49 ` [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-01  5:49 ` Noah Goldstein
  2021-11-06  2:28   ` H.J. Lu
  2021-11-01  5:49 ` [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:49 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
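
As a sketch (pointer names and roles here are placeholders, not the
benchmark's own variables; see the hunk below for the real setup), the
three overlap modes boil down to:

    #include <stddef.h>

    /* NO_OVERLAP = 0, PARTIAL_OVERLAP = 1, COMPLETE_OVERLAP = 2, as in
       the patch below.  */
    static void
    pick_buffers (char *buf_a, char *buf_b, size_t len, int overlap,
                  char **p1, char **p2)
    {
      *p1 = buf_a;
      *p2 = buf_b;                /* no overlap: disjoint buffers */
      if (overlap != 0)
        *p2 = buf_a;              /* complete overlap: same region */
      if (overlap == 1)
        *p2 = buf_a + len / 2;    /* partial overlap: shifted by len / 2 */
    }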
---
 benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..2fb484c0ba 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
+  char *s1, *s2, *tmp;
+  size_t repeats;
 
-  if (overlap)
-    buf2 = buf1;
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
 
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
 
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1



* [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein
  2021-11-01  5:49 ` [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
  2021-11-01  5:49 ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-01  5:49 ` Noah Goldstein
  2021-11-01  5:52   ` Noah Goldstein
  2021-11-06  2:29   ` H.J. Lu
  2021-11-01  5:49 ` [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:49 UTC (permalink / raw)
  To: libc-alpha

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
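
For (3), a rough C rendering of the destination-alignment arithmetic
(the real code is assembly and also keeps the first vector(s) in
registers to store after the `rep movsb`; the names below are mine):

    #include <stdint.h>
    #include <stddef.h>

    struct movsb_args { void *dst; const void *src; size_t len; };

    /* Bump dst up to the next 64-byte boundary and shift src/len by the
       same amount; the skipped head bytes are covered separately.  */
    static struct movsb_args
    align_dst_for_movsb (void *dst, const void *src, size_t len)
    {
      uintptr_t d = (uintptr_t) dst;
      uintptr_t aligned = (d + 63) & ~(uintptr_t) 63;
      size_t skip = aligned - d;
      struct movsb_args a = { (void *) aligned,
                              (const char *) src + skip, len - skip };
      return a;
    }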
---
 sysdeps/x86_64/memmove.S                      |   2 +-
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
 6 files changed, 381 insertions(+), 224 deletions(-)

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f..b2b3180848 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e86..67a55f0c85 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f15..975ae6c051 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39..0fa7126830 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944..88715441fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d4..7b27cbdda5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do foward for correctness. Otherwise
+	   if ecx is non-zero there is 4k False Alaising so do backward
+	   copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save begining of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
 
-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Dont use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Begining of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads dues to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely is
+	   candidate for NT moves aswell.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination.  */
-	movq	%rdi, %r9
-	subq	%rsi, %r9
-	movq	%r9, %r8
-	leaq	-1(%r9), %rcx
-	sarq	$63, %r8
-	xorq	%r8, %r9
-	subq	%r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded.  */
-	cmpq	%r9, %rdx
-	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
 
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page as doing 4 pages give more
+	   time for prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
-- 
2.25.1



* [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein
                   ` (2 preceding siblings ...)
  2021-11-01  5:49 ` [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-01  5:49 ` Noah Goldstein
  2021-11-06  2:31   ` H.J. Lu
  2021-11-06  2:27 ` [PATCH v1 1/5] string: Make tests bidirectional test-memcpy.c H.J. Lu
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:49 UTC (permalink / raw)
  To: libc-alpha

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks, the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium-sized copies.
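
For reference, a sketch of the resulting defaults (derived from the
hunk below; the FSRM special case of 2112 bytes and the tunable
override path are unchanged):

    /* Default rep_movsb_threshold after this patch, as a function of the
       vector size used by the selected memmove variant: 16 KiB for
       AVX-512, 8 KiB for AVX, 4 KiB for SSE2.  */
    static unsigned long
    default_rep_movsb_threshold (unsigned int vec_size)
    {
      return 4096UL * (vec_size / 16);
    }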

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
---
 sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..712b7c7fd0 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
@@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
+
+
+
   unsigned long int rep_movsb_stop_threshold;
   /* ERMS feature is implemented from AMD Zen3 architecture and it is
      performing poorly for data above L2 cache size. Henceforth, adding
-- 
2.25.1



* Re: [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-01  5:49 ` [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-01  5:52   ` Noah Goldstein
  2021-11-06  2:29   ` H.J. Lu
  1 sibling, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-01  5:52 UTC (permalink / raw)
  To: GNU C Library


On Mon, Nov 1, 2021 at 12:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> The optimizations are as follows:
>
> 1) Always align entry to 64 bytes. This makes behavior more
>    predictable and makes other frontend optimizations easier.
>
> 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
>    significant benefits in the case that:
>         0 < (dst - src) < [256, 512]
>
> 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
>    improvement and for FSRM [-10%, 25%].
>
> In addition to these primary changes there is general cleanup
> throughout to optimize the aligning routines and control flow logic.
> ---

Benchmarks were run on:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

Numbers are attached.

All numbers are geometric mean of N = 20 runs.

All numbers are reported as:

(New Time) / (Cur Time(ALIGN_ENTRY=N))

so values below 1.0 favor the new version.

At the top is a breakdown of the performance changes for a given size
class.


Note on regressions:

There are four primary regressions:

1. For copies of length 1. This is because the length-1 path was
   merged with the length-2/3 path. Ultimately we see an improvement
   for lengths 2/3 and a regression for length 1. Based on the
   SPEC2017 distribution, lengths 2/3 are significantly hotter, so I
   think this is likely okay. Does anyone think otherwise?

2. For copy [129, 256] with ALIGN_ENTRY=16,32,48 in the 4k aliasing
   case. I don't fully understand this result; if anyone does, please
   let me know. My GUESS is that this is a benchmark artifact. One
   possible explanation is that the frontend slowdowns for the
   16,32,48 cases allow the previous iteration's stores to clear the
   store buffer before the next iteration's loads. This might be
   preventing the 4k aliasing stalls, so despite the "slower" loop,
   they end up with better performance due to the nature of the
   benchmark (running in a loop with aliasing inputs). It's worth
   noting that in the non-4k-aliasing case, for the exact same sizes,
   ALIGN_ENTRY=64 is significantly faster (a speedup greater than the
   slowdown in the 4k aliasing case). Since 4k aliasing covers ~12.5%
   of inputs, this seems worth it. Ultimately it seems acceptable.
   Does anyone think otherwise?

3. For copy [256, 383] the extra overhead of checking for 4k aliasing
   seems to add a constant factor that does not pay off until 1
   iteration. This is a real judgement call. Again, based on the
   SPEC2017 distribution, the integral of sizes in [384, ...] is
   greater than that of sizes in [256, 383], so it seems worth it.
   Does anyone think otherwise?

4. Various pre-aligned cases in the 'rep movsb' case for FSRM
   (Tigerlake). Aligning before 'rep movsb' is a bit of a mixed
   bag. The results generally seem positive but there are certainly
   some regressions. In general performance appears to be in the range
   of [-15%, +15%]. There does appear to be more improvement than
   cost. It is also worth noting that from a profile of all memcpy
   usage by python3 running the pyperformance benchmark suite, the
   majority of destinations are not aligned:

    [ 4096, 8191]   -> Calls   : 111586         (78.808 )
    [ 8192, 16383]  -> Calls   : 369982         (85.567 )
    [16384, 32765]  -> Calls   : 70008          (79.052 )

   So I believe in general the improvements will speak louder than the
   regressions. As well, for ERMS (which is implemented in the same
   file), the improvements are pretty dramatic in all cases.
   Ultimately this makes me think it's worth it. Does anyone think
   otherwise?


> -L(nop):
> -       ret
> +#if !(defined USE_MULTIARCH && IS_IN (libc))
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
>         VZEROUPPER_RETURN
>  #endif
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> -
>  # if VEC_SIZE == 16
>  ENTRY (__mempcpy_chk_erms)
>         cmp     %RDX_LP, %RCX_LP
> @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  # endif
>
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
>         movq    %rdi, %rax
>  L(start_erms):
>  # ifdef __ILP32__
> @@ -298,310 +313,448 @@ L(start_erms):
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       /* Load regardless.  */
> +       VMOVU   (%rsi), %VEC(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
> -L(last_2x_vec):
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> +        */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>  L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
>         ret
> +# endif
>  #endif
>
> -L(movsb):
> -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> -       jae     L(more_8x_vec)
> -       cmpq    %rsi, %rdi
> -       jb      1f
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       leaq    (%rsi,%rdx), %r9
> -       cmpq    %r9, %rdi
> -       /* Avoid slow backward REP MOVSB.  */
> -       jb      L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rdi, %rcx
> -       subq    %rsi, %rcx
> -       jmp     2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rsi, %rcx
> -       subq    %rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> -   is N*4GB + [1..63] with N >= 0.  */
> -       cmpl    $63, %ecx
> -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> -3:
> -# endif
> -       mov     %RDX_LP, %RCX_LP
> -       rep movsb
> -L(nop):
> +#if LARGE_MOV_SIZE
> +       /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> +          ENTRY block and L(less_vec).  */
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    (%rsi), %ecx
> +       movl    (%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, (%rdi, %rdx)
>         ret
>  #endif
>
> +       .p2align 4
>  L(less_vec):
>         /* Less than 1 VEC.  */
>  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  # error Unsupported VEC_SIZE!
>  #endif
>  #if VEC_SIZE > 32
> -       cmpb    $32, %dl
> +       cmpl    $32, %edx
>         jae     L(between_32_63)
>  #endif
>  #if VEC_SIZE > 16
> -       cmpb    $16, %dl
> +       cmpl    $16, %edx
>         jae     L(between_16_31)
>  #endif
> -       cmpb    $8, %dl
> +       cmpl    $8, %edx
>         jae     L(between_8_15)
> -       cmpb    $4, %dl
> +#if SMALL_MOV_SIZE
> +       cmpl    $4, %edx
> +#else
> +       subq    $4, %rdx
> +#endif
>         jae     L(between_4_7)
> -       cmpb    $1, %dl
> -       ja      L(between_2_3)
> -       jb      1f
> -       movzbl  (%rsi), %ecx
> +       cmpl    $(1 - SMALL_SIZE_OFFSET), %edx
> +       jl      L(copy_0)
> +       movb    (%rsi), %cl
> +       je      L(copy_1)
> +       movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> +       movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> +L(copy_1):
>         movb    %cl, (%rdi)
> -1:
> +L(copy_0):
>         ret
> +
> +#if SMALL_MOV_SIZE
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    -4(%rsi, %rdx), %ecx
> +       movl    (%rsi), %esi
> +       movl    %ecx, -4(%rdi, %rdx)
> +       movl    %esi, (%rdi)
> +       ret
> +#endif
> +
> +#if VEC_SIZE > 16
> +       /* From 16 to 31.  No branch when size == 16.  */
> +       .p2align 4,, 8
> +L(between_16_31):
> +       vmovdqu (%rsi), %xmm0
> +       vmovdqu -16(%rsi, %rdx), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -16(%rdi, %rdx)
> +       /* No ymm registers have been touched.  */
> +       ret
> +#endif
> +
>  #if VEC_SIZE > 32
> +       .p2align 4,, 10
>  L(between_32_63):
>         /* From 32 to 63.  No branch when size == 32.  */
>         VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi,%rdx), %YMM1
> +       VMOVU   -32(%rsi, %rdx), %YMM1
>         VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -#endif
> -#if VEC_SIZE > 16
> -       /* From 16 to 31.  No branch when size == 16.  */
> -L(between_16_31):
> -       VMOVU   (%rsi), %XMM0
> -       VMOVU   -16(%rsi,%rdx), %XMM1
> -       VMOVU   %XMM0, (%rdi)
> -       VMOVU   %XMM1, -16(%rdi,%rdx)
> +       VMOVU   %YMM1, -32(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
> +
> +       .p2align 4,, 10
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
> -       movq    -8(%rsi,%rdx), %rcx
> +       movq    -8(%rsi, %rdx), %rcx
>         movq    (%rsi), %rsi
> -       movq    %rcx, -8(%rdi,%rdx)
>         movq    %rsi, (%rdi)
> +       movq    %rcx, -8(%rdi, %rdx)
>         ret
> -L(between_4_7):
> -       /* From 4 to 7.  No branch when size == 4.  */
> -       movl    -4(%rsi,%rdx), %ecx
> -       movl    (%rsi), %esi
> -       movl    %ecx, -4(%rdi,%rdx)
> -       movl    %esi, (%rdi)
> -       ret
> -L(between_2_3):
> -       /* From 2 to 3.  No branch when size == 2.  */
> -       movzwl  -2(%rsi,%rdx), %ecx
> -       movzwl  (%rsi), %esi
> -       movw    %cx, -2(%rdi,%rdx)
> -       movw    %si, (%rdi)
> -       ret
>
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> +
> +       /* VEC(0) and VEC(1) have already been loaded.  */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  L(movsb_more_2x_vec):
>         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
>         ja      L(movsb)
>  #endif
>  L(more_2x_vec):
> -       /* More than 2 * VEC and there may be overlap between destination
> -          and source.  */
> +       /* More than 2 * VEC and there may be overlap between
> +          destination and source.  */
>         cmpq    $(VEC_SIZE * 8), %rdx
>         ja      L(more_8x_vec)
> +       /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
> -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), VEC_SIZE(%rdi)
>         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -L(last_4x_vec):
> -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 4
>  L(more_8x_vec):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* Go to backwards temporal copy if overlap no matter what as
> +          backward REP MOVSB is slow and we don't want to use NT stores if
> +          there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
>         /* Check if non-temporal move candidate.  */
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>         /* Check non-temporal store threshold.  */
> -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
>         ja      L(large_memcpy_2x)
>  #endif
> -       /* Entry if rdx is greater than non-temporal threshold but there
> -       is overlap.  */
> +       /* To reach this point there cannot be overlap and dst > src. So
> +          check for overlap and src > dst in which case correctness
> +          requires forward copy. Otherwise decide between backward/forward
> +          copy depending on address aliasing.  */
> +
> +       /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> +          but less than __x86_shared_non_temporal_threshold.  */
>  L(more_8x_vec_check):
> -       cmpq    %rsi, %rdi
> -       ja      L(more_8x_vec_backward)
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       /* Load the first VEC and last 4 * VEC to support overlapping
> -          addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +       /* rcx contains dst - src. Add back length (rdx).  */
> +       leaq    (%rcx, %rdx), %r8
> +       /* If r8 has different sign than rcx then there is overlap so we
> +          must do forward copy.  */
> +       xorq    %rcx, %r8
> +       /* Isolate just sign bit of r8.  */
> +       shrq    $63, %r8
> +       /* Get 4k difference dst - src.  */
> +       andl    $(PAGE_SIZE - 256), %ecx
> +       /* If r8 is non-zero must do forward for correctness. Otherwise
> +          if ecx is non-zero there is 4k False Aliasing so do backward
> +          copy.  */
> +       addl    %r8d, %ecx
> +       jz      L(more_8x_vec_backward)
> +
> +       /* if rdx is greater than __x86_shared_non_temporal_threshold
> +          but there is overlap, or from short distance movsb.  */
> +L(more_8x_vec_forward):
> +       /* Load first and last 4 * VEC to support overlapping addresses.
> +        */
> +
> +       /* First vec was already loaded into VEC(0).  */
>         VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
>         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +       /* Save beginning of dst.  */
> +       movq    %rdi, %rcx
> +       /* Align dst to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
>         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
>         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> -       /* Save start and stop of the destination buffer.  */
> -       movq    %rdi, %r11
> -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> -       /* Align destination for aligned stores in the loop.  Compute
> -          how much destination is misaligned.  */
> -       movq    %rdi, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Get the negative of offset for alignment.  */
> -       subq    $VEC_SIZE, %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rsi
> -       /* Adjust destination which should be aligned now.  */
> -       subq    %r8, %rdi
> -       /* Adjust length.  */
> -       addq    %r8, %rdx
>
> -       .p2align 4
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rcx, %rsi
> +       /* Finish aligning dst.  */
> +       incq    %rdi
> +       /* Restore src adjusted with new value for aligned dst.  */
> +       addq    %rdi, %rsi
> +       /* Store end of buffer minus tail in rdx.  */
> +       leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VEC(1), (%rdi)
> +       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (%rcx)
> -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> +       VMOVU   %VEC(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(4), (%r11)
> +       VMOVU   %VEC(0), (%rcx)
> +       /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> +        */
> +L(nop_backward):
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 8
> +L(more_8x_vec_backward_check_nop):
> +       /* rcx contains dst - src. Test for dst == src to skip all of
> +          memmove.  */
> +       testq   %rcx, %rcx
> +       jz      L(nop_backward)
>  L(more_8x_vec_backward):
>         /* Load the first 4 * VEC and last VEC to support overlapping
>            addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +
> +       /* First vec was also loaded into VEC(0).  */
>         VMOVU   VEC_SIZE(%rsi), %VEC(5)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> +       /* Beginning of region for 4x backward copy stored in rcx.  */
> +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> -       /* Save stop of the destination buffer.  */
> -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> -       /* Align destination end for aligned stores in the loop.  Compute
> -          how much destination end is misaligned.  */
> -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> -       movq    %r11, %r9
> -       movq    %r11, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rcx
> -       /* Adjust the end of destination which should be aligned now.  */
> -       subq    %r8, %r9
> -       /* Adjust length.  */
> -       subq    %r8, %rdx
> -
> -       .p2align 4
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Align dst.  */
> +       andq    $-(VEC_SIZE), %rcx
> +       /* Restore src.  */
> +       addq    %rcx, %rsi
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (%rcx), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%r9)
> -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> -       addq    $-(VEC_SIZE * 4), %r9
> -       cmpq    $(VEC_SIZE * 4), %rdx
> -       ja      L(loop_4x_vec_backward)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> +       addq    $(VEC_SIZE * -4), %rsi
> +       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> +       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> +       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> +       addq    $(VEC_SIZE * -4), %rcx
> +       cmpq    %rcx, %rdi
> +       jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(4), (%rdi)
> +       VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(5), VEC_SIZE(%rdi)
>         VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), (%r11)
> +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> +       VZEROUPPER_RETURN
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +       /* L(skip_short_movsb_check) is only used with ERMS. Not for
> +          FSRM.  */
> +       .p2align 5,, 16
> +# if ALIGN_MOVSB
> +L(skip_short_movsb_check):
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* If CPU does not have FSRM two options for aligning. Align src
> +          if dst and src 4k alias. Otherwise align dst.  */
> +       testl   $(PAGE_SIZE - 512), %ecx
> +       jnz     L(movsb_align_dst)
> +       /* Fall through. dst and src 4k alias. It's better to align src
> +          here because the bottleneck will be loads due to the false
> +          dependency on dst.  */
> +
> +       /* rcx already has dst - src.  */
> +       movq    %rcx, %r9
> +       /* Add src to len. Subtract back after src aligned. -1 because
> +          src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +       leaq    -1(%rsi, %rdx), %rcx
> +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> +       /* Restore dst and len adjusted with new values for aligned dst.
> +        */
> +       leaq    1(%rsi, %r9), %rdi
> +       subq    %rsi, %rcx
> +       /* Finish aligning src.  */
> +       incq    %rsi
> +
> +       rep     movsb
> +
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
>         VZEROUPPER_RETURN
> +# endif
> +
> +       .p2align 4,, 12
> +L(movsb):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* Go to backwards temporal copy if overlap no matter what as
> +          backward REP MOVSB is slow and we don't want to use NT stores if
> +          there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
> +# if ALIGN_MOVSB
> +       /* Save dest for storing aligning VECs later.  */
> +       movq    %rdi, %r8
> +# endif
> +       /* If above __x86_rep_movsb_stop_threshold most likely is
> +          candidate for NT moves as well.  */
> +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +       jae     L(large_memcpy_2x_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> +       /* Only avoid short movsb if CPU has FSRM.  */
> +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      L(skip_short_movsb_check)
> +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       /* Avoid "rep movsb" if RCX, the distance between source and
> +          destination, is N*4GB + [1..63] with N >= 0.  */
> +
> +       /* ecx contains dst - src. Early check for backward copy
> +          conditions means only case of slow movsb with src = dst + [0,
> +          63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
> +          for that case.  */
> +       cmpl    $-64, %ecx
> +       ja      L(more_8x_vec_forward)
> +#  endif
> +# endif
> +# if ALIGN_MOVSB
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* Fall through means cpu has FSRM. In that case exclusively
> +          align destination.  */
> +L(movsb_align_dst):
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> +       /* Add dst to len. Subtract back after dst aligned.  */
> +       leaq    (%r8, %rdx), %rcx
> +       /* Finish aligning dst.  */
> +       andq    $-(MOVSB_ALIGN_TO), %rdi
> +       /* Restore src and len adjusted with new values for aligned dst.
> +        */
> +       addq    %rdi, %rsi
> +       subq    %rdi, %rcx
> +
> +       rep     movsb
> +
> +       /* Store VECs loaded for aligning.  */
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +# else /* !ALIGN_MOVSB.  */
> +L(skip_short_movsb_check):
> +       mov     %RDX_LP, %RCX_LP
> +       rep     movsb
> +       ret
> +# endif
> +#endif
>
> +       .p2align 4,, 10
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -       .p2align 4
> +L(large_memcpy_2x_check):
> +       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> +       jb      L(more_8x_vec_check)
>  L(large_memcpy_2x):
> -       /* Compute absolute value of difference between source and
> -          destination.  */
> -       movq    %rdi, %r9
> -       subq    %rsi, %r9
> -       movq    %r9, %r8
> -       leaq    -1(%r9), %rcx
> -       sarq    $63, %r8
> -       xorq    %r8, %r9
> -       subq    %r8, %r9
> -       /* Don't use non-temporal store if there is overlap between
> -          destination and source since destination may be in cache when
> -          source is loaded.  */
> -       cmpq    %r9, %rdx
> -       ja      L(more_8x_vec_check)
> +       /* To reach this point it is impossible for dst > src and
> +          overlap. Remaining to check is src > dst and overlap. rcx
> +          already contains dst - src. Negate rcx to get src - dst. If
> +          length > rcx then there is overlap and forward copy is best.  */
> +       negq    %rcx
> +       cmpq    %rcx, %rdx
> +       ja      L(more_8x_vec_forward)
>
>         /* Cache align destination. First store the first 64 bytes then
>            adjust alignments.  */
> -       VMOVU   (%rsi), %VEC(8)
> -#if VEC_SIZE < 64
> -       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> -#if VEC_SIZE < 32
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> -#endif
> -#endif
> -       VMOVU   %VEC(8), (%rdi)
> -#if VEC_SIZE < 64
> -       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> -#if VEC_SIZE < 32
> -       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> -#endif
> -#endif
> +
> +       /* First vec was also loaded into VEC(0).  */
> +# if VEC_SIZE < 64
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  if VEC_SIZE < 32
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +#  endif
> +# endif
> +       VMOVU   %VEC(0), (%rdi)
> +# if VEC_SIZE < 64
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +#  if VEC_SIZE < 32
> +       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +#  endif
> +# endif
> +
>         /* Adjust source, destination, and size.  */
>         movq    %rdi, %r8
>         andq    $63, %r8
> @@ -614,9 +767,13 @@ L(large_memcpy_2x):
>         /* Adjust length.  */
>         addq    %r8, %rdx
>
> -       /* Test if source and destination addresses will alias. If they do
> -          the larger pipeline in large_memcpy_4x alleviated the
> +       /* Test if source and destination addresses will alias. If they
> +          do the larger pipeline in large_memcpy_4x alleviated the
>            performance drop.  */
> +
> +       /* ecx contains -(dst - src). not ecx will return dst - src - 1
> +          which works for testing aliasing.  */
> +       notl    %ecx
>         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
>         jz      L(large_memcpy_4x)
>
> @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
>         /* ecx stores inner loop counter.  */
>         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
>  L(loop_large_memcpy_4x_inner):
> -       /* Only one prefetch set per page as doing 4 pages give more time
> -          for prefetcher to keep up.  */
> +       /* Only one prefetch set per page as doing 4 pages give more
> +          time for prefetcher to keep up.  */
>         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> --
> 2.25.1
>

[-- Attachment #2: tgl.pdf --]
[-- Type: application/pdf, Size: 877538 bytes --]

[-- Attachment #3: skl.pdf --]
[-- Type: application/pdf, Size: 878621 bytes --]

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                   ` (3 preceding siblings ...)
  2021-11-01  5:49 ` [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
@ 2021-11-06  2:27 ` H.J. Lu
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06  2:27 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

On Mon, Nov 01, 2021 at 12:49:48AM -0500, Noah Goldstein wrote:
> This commit updates the memcpy tests to test both dst > src and dst <
> src. This is because there is logic in the code based on the
> ---
>  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
>  string/test-memmove.c |  75 ++++++++++++++++++-
>  2 files changed, 214 insertions(+), 28 deletions(-)
> 
> diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> index c9e965bed3..3b0f3127b7 100644
> --- a/string/test-memcpy.c
> +++ b/string/test-memcpy.c
> @@ -17,6 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #ifndef MEMCPY_RESULT
> +# define DO_EXTRA_TESTS
>  # define MEMCPY_RESULT(dst, len) dst
>  # define MIN_PAGE_SIZE 131072
>  # define TEST_MAIN
> @@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
>  static void
>  do_test (size_t align1, size_t align2, size_t len)
>  {
> -  size_t i, j;
> +  size_t i, j, repeats;
>    char *s1, *s2;
>  
>    align1 &= 4095;
> @@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
>  
>    s1 = (char *) (buf1 + align1);
>    s2 = (char *) (buf2 + align2);
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      for (i = 0, j = 1; i < len; i++, j += 23)
> +        s1[i] = j;
>  
> -  for (i = 0, j = 1; i < len; i++, j += 23)
> -    s1[i] = j;
> -
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (impl, s2, s1, len);
> +    }
>  }
>  
>  static void
> @@ -212,56 +215,87 @@ do_random_tests (void)
>  }
>  
>  static void
> -do_test1 (size_t size)
> +do_test1 (size_t align1, size_t align2, size_t size)
>  {
>    void *large_buf;
> -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> -		    MAP_PRIVATE | MAP_ANON, -1, 0);
> +  size_t mmap_size, region_size;
> +
> +  align1 &= (page_size - 1);
> +  if (align1 == 0)
> +    align1 = page_size;
> +
> +  align2 &= (page_size - 1);
> +  if (align2 == 0)
> +    align2 = page_size;
> +
> +  region_size = (size + page_size - 1) & (~(page_size - 1));
> +
> +  mmap_size = region_size * 2 + 3 * page_size;
> +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANON, -1, 0);
>    if (large_buf == MAP_FAILED)
>      {
> -      puts ("Failed to allocat large_buf, skipping do_test1");
> +      puts ("Failed to allocate large_buf, skipping do_test1");
>        return;
>      }
> -
> -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
>      error (EXIT_FAILURE, errno, "mprotect failed");
>  
> -  size_t arrary_size = size / sizeof (uint32_t);
> -  uint32_t *dest = large_buf;
> -  uint32_t *src = large_buf + size + page_size;
> +  size_t array_size = size / sizeof (uint32_t);
> +  uint32_t *dest = large_buf + align1;
> +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
>    size_t i;
>    size_t repeats;
>    for(repeats = 0; repeats < 2; repeats++)
>      {
> -      for (i = 0; i < arrary_size; i++)
> +      for (i = 0; i < array_size; i++)
>          src[i] = (uint32_t) i;
> -
>        FOR_EACH_IMPL (impl, 0)
>          {
> -            printf ("\t\tRunning: %s\n", impl->name);
>            memset (dest, -1, size);
>            CALL (impl, (char *) dest, (char *) src, size);
> -          for (i = 0; i < arrary_size; i++)
> +          for (i = 0; i < array_size; i++)
>          if (dest[i] != src[i])
>            {
>              error (0, 0,
>                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
>                 impl->name, dest, src, i);
>              ret = 1;
> -            munmap ((void *) large_buf, size * 2 + page_size);
> +            munmap ((void *) large_buf, mmap_size);
>              return;
>            }
>          }
> -      dest = src;
> -      src = large_buf;
> +      dest = large_buf + region_size + 2 * page_size + align1;
> +      src = large_buf + align2;
> +    }
> +  munmap ((void *) large_buf, mmap_size);
> +}
> +
> +static void
> +do_random_large_tests (void)
> +{
> +  size_t i, align1, align2, size;
> +  for (i = 0; i < 32; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 0x1000000) + 0x200000;
> +      do_test1 (align1, align2, size);
> +    }
> +
> +  for (i = 0; i < 128; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 32768) + 4096;
> +      do_test1 (align1, align2, size);
>      }
> -  munmap ((void *) large_buf, size * 2 + page_size);
>  }
>  
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j;
>  
>    test_init ();
>  
> @@ -298,6 +332,7 @@ test_main (void)
>    for (i = 19; i <= 25; ++i)
>      {
>        do_test (255, 0, 1 << i);
> +      do_test (0, 4000, 1 << i);
>        do_test (0, 255, i);
>        do_test (0, 4000, i);
>      }
> @@ -306,8 +341,88 @@ test_main (void)
>  
>    do_random_tests ();
>  
> -  do_test1 (0x100000);
> -  do_test1 (0x2000000);
> +  do_test1 (0, 0, 0x100000);
> +  do_test1 (0, 0, 0x2000000);
> +
> +  for (i = 4096; i < 32768; i += 4096)
> +    {
> +      for (j = 1; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +
> +  for (i = 0x300000; i < 0x2000000; i += 0x235689)
> +    {
> +      for (j = 64; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +#ifdef DO_EXTRA_TESTS
> +  for (i = 0x200000; i <= 0x2000000; i += i)
> +    {
> +      for (j = 64; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +
> +          do_test1 (0, j, i + 1);
> +          do_test1 (4095, j, i + 1);
> +          do_test1 (4096 - j, 0, i + 1);
> +
> +          do_test1 (0, j - 1, i + 1);
> +          do_test1 (4095, j - 1, i + 1);
> +          do_test1 (4096 - j - 1, 0, i + 1);
> +
> +          do_test1 (0, j + 1, i + 1);
> +          do_test1 (4095, j + 1, i + 1);
> +          do_test1 (4096 - j, 1, i + 1);
> +
> +          do_test1 (0, j, i - 1);
> +          do_test1 (4095, j, i - 1);
> +          do_test1 (4096 - j, 0, i - 1);
> +
> +          do_test1 (0, j - 1, i - 1);
> +          do_test1 (4095, j - 1, i - 1);
> +          do_test1 (4096 - j - 1, 0, i - 1);
> +
> +          do_test1 (0, j + 1, i - 1);
> +          do_test1 (4095, j + 1, i - 1);
> +          do_test1 (4096 - j, 1, i - 1);
> +        }
> +    }
> +#endif
> +  do_random_large_tests ();
>    return ret;
>  }
>  
> diff --git a/string/test-memmove.c b/string/test-memmove.c
> index a0ce8b0334..5c6d1579e3 100644
> --- a/string/test-memmove.c
> +++ b/string/test-memmove.c
> @@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>  
> -  align1 &= 63;
> +  align1 &= (getpagesize() - 1);
>    if (align1 + len >= page_size)
>      return;
>  
> -  align2 &= 63;
> +  align2 &= (getpagesize() - 1);
>    if (align2 + len >= page_size)
>      return;
>  
> @@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
>    munmap ((void *) buf, size);
>  }
>  
> +static void
> +do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
> +{
> +  size_t size, repeats, i;
> +  uint8_t *buf, *dst, *src;
> +
> +  size = bytes_move + MAX(offset1, offset2);
> +  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +             MAP_PRIVATE | MAP_ANON, -1, 0);
> +
> +  if (buf == MAP_FAILED)
> +    error (EXIT_UNSUPPORTED, errno, "mmap failed");
> +
> +  dst = &buf[offset1];
> +  src = &buf[offset2];
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      FOR_EACH_IMPL (impl, 0)
> +        {
> +          for (i = 0; i < bytes_move; i++)
> +              src[i] = (uint8_t) i;
> +#ifdef TEST_BCOPY
> +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> +#else
> +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> +#endif
> +          for (i = 0; i < bytes_move; i++)
> +            {
> +              if (dst[i] != (uint8_t) i)
> +                {
> +                  error (0, 0,
> +                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> +                         impl->name, dst, buf, i);
> +                  ret = 1;
> +                  break;
> +                }
> +            }
> +        }
> +      dst = &buf[offset2];
> +      src = &buf[offset1];
> +    }
> +  munmap ((void *) buf, size);
> +}
> +
> +
>  int
>  test_main (void)
>  {
> @@ -395,13 +440,39 @@ test_main (void)
>  
>    do_random_tests ();
>  
> +  do_test2 (0);
>    do_test2 (33);
> +  do_test2 (0x200000 - 1);
>    do_test2 (0x200000);
> +  do_test2 (0x200000 + 1);
> +  do_test2 (0x1000000 - 1);
> +  do_test2 (0x1000000);
> +  do_test2 (0x1000000 + 1);
>    do_test2 (0x4000000 - 1);
>    do_test2 (0x4000000);
> +  do_test2 (0x4000000 + 1);
>  
>    /* Copy 16KB data.  */
>    do_test3 (16384, 3);
> +  for (i = 4096; i <= 16384; i <<= 1)
> +    {
> +      do_test4 (i, 0, i);
> +      do_test4 (i, 0, i - 1);
> +      do_test4 (i, 0, i + 1);
> +      do_test4 (i, 63, i + 63);
> +      do_test4 (i, 63, i + 64);
> +      do_test4 (i, 63, i);
> +
> +      do_test4 (i, 0, 1);
> +      do_test4 (i, 0, 15);
> +      do_test4 (i, 0, 31);
> +      do_test4 (i, 0, 63);
> +      do_test4 (i, 0, 64);
> +      do_test4 (i, 0, 65);
> +      do_test4 (i, 0, 127);
> +      do_test4 (i, 0, 129);
> +    }
> +
>  
>    return ret;
>  }
> -- 
> 2.25.1
> 

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-01  5:49 ` [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-06  2:27   ` H.J. Lu
  0 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06  2:27 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Nov 01, 2021 at 12:49:49AM -0500, Noah Goldstein wrote:
> This commit adds more benchmarks for the common memcpy/memmove
> benchmarks. The most significant cases are the half page offsets. The
> current versions leave dst and src near page aligned which leads to
> false 4k aliasing on x86_64. This can add noise due to false
> dependencies from one run to the next. As well, this seems like more
> of an edge case than the common case so it shouldn't be the only thing
> ---
>  benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
>  benchtests/bench-memmove.c | 26 +++++++++++++++++---
>  2 files changed, 66 insertions(+), 9 deletions(-)
> 
> diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
> index d9236a2282..744bea26d3 100644
> --- a/benchtests/bench-memcpy.c
> +++ b/benchtests/bench-memcpy.c
> @@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> -
> +  for (i = 0; i < iters / 64; ++i)
> +    {
> +      CALL (impl, dst, src, len);
> +    }
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> @@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
>    size_t i, j;
>    char *s1, *s2;
>    size_t repeats;
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>  
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>  
> @@ -99,7 +102,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
> +  size_t half_page = getpagesize () / 2;
>    test_init ();
>  
>    json_init (&json_ctx, 0, stdout);
> @@ -121,8 +124,15 @@ test_main (void)
>      {
>        do_test (&json_ctx, 0, 0, 1 << i, 1);
>        do_test (&json_ctx, i, 0, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
>        do_test (&json_ctx, 0, i, 1 << i, 1);
> +      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
>        do_test (&json_ctx, i, i, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
> +      do_test (&json_ctx, half_page, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page, i, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
>      }
>  
>    for (i = 0; i < 32; ++i)
> @@ -131,16 +141,26 @@ test_main (void)
>        do_test (&json_ctx, i, 0, i, 0);
>        do_test (&json_ctx, 0, i, i, 0);
>        do_test (&json_ctx, i, i, i, 0);
> +      do_test (&json_ctx, half_page, 0, i, 0);
> +      do_test (&json_ctx, half_page + i, 0, i, 0);
> +      do_test (&json_ctx, half_page, i, i, 0);
> +      do_test (&json_ctx, half_page + i, i, i, 0);
> +      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
> +      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
>      }
>  
>    for (i = 3; i < 32; ++i)
>      {
>        if ((i & (i - 1)) == 0)
> -	continue;
> +        continue;
>        do_test (&json_ctx, 0, 0, 16 * i, 1);
>        do_test (&json_ctx, i, 0, 16 * i, 1);
>        do_test (&json_ctx, 0, i, 16 * i, 1);
>        do_test (&json_ctx, i, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
>      }
>  
>    for (i = 32; i < 64; ++i)
> @@ -149,16 +169,33 @@ test_main (void)
>        do_test (&json_ctx, i, 0, 32 * i, 1);
>        do_test (&json_ctx, 0, i, 32 * i, 1);
>        do_test (&json_ctx, i, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
>      }
>  
>    do_test (&json_ctx, 0, 0, getpagesize (), 1);
>  
> -  for (i = 0; i <= 32; ++i)
> +  for (i = 0; i <= 48; ++i)
>      {
>        do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
>      }
>  
>    json_array_end (&json_ctx);
> diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
> index 6becbf4782..855f4d0649 100644
> --- a/benchtests/bench-memmove.c
> +++ b/benchtests/bench-memmove.c
> @@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> -
> +  for (i = 0; i < iters / 64; ++i)
> +    {
> +      CALL (impl, dst, src, len);
> +    }
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> @@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>  
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>  
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>  
> @@ -85,6 +88,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> +  size_t half_page = getpagesize () / 2;
>  
>    test_init ();
>  
> @@ -138,6 +142,22 @@ test_main (void)
>        do_test (&json_ctx, i, i, 32 * i);
>      }
>  
> +  for (i = 0; i <= 48; ++i)
> +    {
> +      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
> +    }
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> -- 
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-01  5:49 ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-06  2:28   ` H.J. Lu
  0 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06  2:28 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Nov 01, 2021 at 12:49:50AM -0500, Noah Goldstein wrote:
> This commit adds a new partial overlap benchmark. This is generally
> the most interesting performance case for memmove and was missing.
> ---
>  benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
>  1 file changed, 46 insertions(+), 15 deletions(-)
> 
> diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
> index b5fdb2a422..2fb484c0ba 100644
> --- a/benchtests/bench-memmove-walk.c
> +++ b/benchtests/bench-memmove-walk.c
> @@ -36,6 +36,10 @@
>  # define TIMEOUT (20 * 60)
>  # include "bench-string.h"
>  
> +#define NO_OVERLAP 0
> +#define PARTIAL_OVERLAP 1
> +#define COMPLETE_OVERLAP 2
> +
>  IMPL (memmove, 1)
>  #endif
>  
> @@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  }
>  
>  static void
> -do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
> +do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
>  {
> -  json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) len);
> -  json_array_begin (json_ctx, "timings");
> +  char *s1, *s2, *tmp;
> +  size_t repeats;
>  
> -  if (overlap)
> -    buf2 = buf1;
> +  s1 = (char *) (buf1);
> +  s2 = (char *) (buf2);
> +  if (overlap != NO_OVERLAP)
> +    s2 = s1;
> +  if (overlap == PARTIAL_OVERLAP)
> +    s2 += len / 2;
>  
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> +  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_uint (json_ctx, "length", (double) len);
> +      json_attr_string(json_ctx, "overlap",
> +                       overlap == NO_OVERLAP        ? "none"
> +                       : overlap == PARTIAL_OVERLAP ? "partial"
> +                                                    : "complete");
> +      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
> +      json_array_begin (json_ctx, "timings");
> +
> +
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
>  
> -  json_array_end (json_ctx);
> -  json_element_object_end (json_ctx);
> +      json_array_end (json_ctx);
> +      json_element_object_end (json_ctx);
> +
> +      tmp = s1;
> +      s1 = s2;
> +      s2 = tmp;
> +    }
>  }
>  
>  int
> @@ -107,15 +131,22 @@ test_main (void)
>    /* Non-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, false);
> -      do_test (&json_ctx, i + 1, false);
> +      do_test (&json_ctx, i, NO_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
> +    }
> +
> +  /* Partially-overlapping buffers.  */
> +  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
> +    {
> +      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
>      }
>  
> -  /* Overlapping buffers.  */
> +  /* Complete-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, true);
> -      do_test (&json_ctx, i + 1, true);
> +      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
> +      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
>      }
>  
>    json_array_end (&json_ctx);
> -- 
> 2.25.1
> 

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-01  5:49 ` [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
  2021-11-01  5:52   ` Noah Goldstein
@ 2021-11-06  2:29   ` H.J. Lu
  1 sibling, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06  2:29 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Nov 01, 2021 at 12:49:51AM -0500, Noah Goldstein wrote:
> No bug.
> 
> The optimizations are as follows:
> 
> 1) Always align entry to 64 bytes. This makes behavior more
>    predictable and makes other frontend optimizations easier.
> 
> 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
>    significant benefits in the case that:
>         0 < (dst - src) < [256, 512]
> 
> 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
>    improvement and for FSRM [-10%, 25%].
> 
> In addition to these primary changes there is general cleanup
> throughout to optimize the aligning routines and control flow logic.
> ---
>  sysdeps/x86_64/memmove.S                      |   2 +-
>  .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
>  .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
>  .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
>  .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
>  6 files changed, 381 insertions(+), 224 deletions(-)
> 
> diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> index db106a7a1f..b2b3180848 100644
> --- a/sysdeps/x86_64/memmove.S
> +++ b/sysdeps/x86_64/memmove.S
> @@ -25,7 +25,7 @@
>  /* Use movups and movaps for smaller code sizes.  */
>  #define VMOVU		movups
>  #define VMOVA		movaps
> -
> +#define MOV_SIZE	3
>  #define SECTION(p)		p
>  
>  #ifdef USE_MULTIARCH
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 1ec1962e86..67a55f0c85 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT		vmovntdq
>  # define VMOVU		vmovdqu
>  # define VMOVA		vmovdqa
> -
> +# define MOV_SIZE	4
>  # define ZERO_UPPER_VEC_REGISTERS_RETURN \
>    ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
>  
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index e195e93f15..975ae6c051 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT		vmovntdq
>  # define VMOVU		vmovdqu
>  # define VMOVA		vmovdqa
> -
> +# define MOV_SIZE	4
>  # define SECTION(p)		p##.avx
>  # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
>  
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 848848ab39..0fa7126830 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU		vmovdqu64
>  # define VMOVA		vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE	6
>  # define SECTION(p)		p##.evex512
>  # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
>  
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 0cbce8f944..88715441fe 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU		vmovdqu64
>  # define VMOVA		vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE	6
>  # define SECTION(p)		p##.evex
>  # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
>  
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index abde8438d4..7b27cbdda5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -76,6 +76,25 @@
>  # endif
>  #endif
>  
> +/* Whether to align before movsb. Ultimately we want 64 byte
> +   alignment, but it is not worth loading 4x VEC for VEC_SIZE == 16.  */
> +#define ALIGN_MOVSB	(VEC_SIZE > 16)
> +/* Number of bytes to align movsb to.  */
> +#define MOVSB_ALIGN_TO	64
> +
> +#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
> +#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
> +
> +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> +# error MOV_SIZE Unknown
> +#endif
> +
> +#if LARGE_MOV_SIZE
> +# define SMALL_SIZE_OFFSET	(4)
> +#else
> +# define SMALL_SIZE_OFFSET	(0)
> +#endif
> +
>  #ifndef PAGE_SIZE
>  # define PAGE_SIZE 4096
>  #endif
> @@ -199,25 +218,21 @@ L(start):
>  # endif
>  	cmp	$VEC_SIZE, %RDX_LP
>  	jb	L(less_vec)
> +	/* Load regardless.  */
> +	VMOVU	(%rsi), %VEC(0)
>  	cmp	$(VEC_SIZE * 2), %RDX_LP
>  	ja	L(more_2x_vec)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(last_2x_vec):
> -#endif
>  	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -	VMOVU	(%rsi), %VEC(0)
>  	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
>  	VMOVU	%VEC(0), (%rdi)
>  	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> -	ret
> +#if !(defined USE_MULTIARCH && IS_IN (libc))
> +	ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
>  	VZEROUPPER_RETURN
>  #endif
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> -
>  # if VEC_SIZE == 16
>  ENTRY (__mempcpy_chk_erms)
>  	cmp	%RDX_LP, %RCX_LP
> @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  # endif
>  
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
>  	movq	%rdi, %rax
>  L(start_erms):
>  # ifdef __ILP32__
> @@ -298,310 +313,448 @@ L(start_erms):
>  # endif
>  	cmp	$VEC_SIZE, %RDX_LP
>  	jb	L(less_vec)
> +	/* Load regardless.  */
> +	VMOVU	(%rsi), %VEC(0)
>  	cmp	$(VEC_SIZE * 2), %RDX_LP
>  	ja	L(movsb_more_2x_vec)
> -L(last_2x_vec):
> -	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> -	VMOVU	(%rsi), %VEC(0)
> -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> +	 */
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
>  	VMOVU	%VEC(0), (%rdi)
> -	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
>  L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
>  	ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
>  	ret
> +# endif
>  #endif
>  
> -L(movsb):
> -	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> -	jae	L(more_8x_vec)
> -	cmpq	%rsi, %rdi
> -	jb	1f
> -	/* Source == destination is less common.  */
> -	je	L(nop)
> -	leaq	(%rsi,%rdx), %r9
> -	cmpq	%r9, %rdi
> -	/* Avoid slow backward REP MOVSB.  */
> -	jb	L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -	jz	3f
> -	movq	%rdi, %rcx
> -	subq	%rsi, %rcx
> -	jmp	2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -	jz	3f
> -	movq	%rsi, %rcx
> -	subq	%rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> -   is N*4GB + [1..63] with N >= 0.  */
> -	cmpl	$63, %ecx
> -	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
> -3:
> -# endif
> -	mov	%RDX_LP, %RCX_LP
> -	rep movsb
> -L(nop):
> +#if LARGE_MOV_SIZE
> +	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> +	   ENTRY block and L(less_vec).  */
> +	.p2align 4,, 8
> +L(between_4_7):
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	movl	(%rsi), %ecx
> +	movl	(%rsi, %rdx), %esi
> +	movl	%ecx, (%rdi)
> +	movl	%esi, (%rdi, %rdx)
>  	ret
>  #endif
>  
> +	.p2align 4
>  L(less_vec):
>  	/* Less than 1 VEC.  */
>  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  # error Unsupported VEC_SIZE!
>  #endif
>  #if VEC_SIZE > 32
> -	cmpb	$32, %dl
> +	cmpl	$32, %edx
>  	jae	L(between_32_63)
>  #endif
>  #if VEC_SIZE > 16
> -	cmpb	$16, %dl
> +	cmpl	$16, %edx
>  	jae	L(between_16_31)
>  #endif
> -	cmpb	$8, %dl
> +	cmpl	$8, %edx
>  	jae	L(between_8_15)
> -	cmpb	$4, %dl
> +#if SMALL_MOV_SIZE
> +	cmpl	$4, %edx
> +#else
> +	subq	$4, %rdx
> +#endif
>  	jae	L(between_4_7)
> -	cmpb	$1, %dl
> -	ja	L(between_2_3)
> -	jb	1f
> -	movzbl	(%rsi), %ecx
> +	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
> +	jl	L(copy_0)
> +	movb	(%rsi), %cl
> +	je	L(copy_1)
> +	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> +	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> +L(copy_1):
>  	movb	%cl, (%rdi)
> -1:
> +L(copy_0):
>  	ret
> +
> +#if SMALL_MOV_SIZE
> +	.p2align 4,, 8
> +L(between_4_7):
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	movl	-4(%rsi, %rdx), %ecx
> +	movl	(%rsi), %esi
> +	movl	%ecx, -4(%rdi, %rdx)
> +	movl	%esi, (%rdi)
> +	ret
> +#endif
> +
> +#if VEC_SIZE > 16
> +	/* From 16 to 31.  No branch when size == 16.  */
> +	.p2align 4,, 8
> +L(between_16_31):
> +	vmovdqu	(%rsi), %xmm0
> +	vmovdqu	-16(%rsi, %rdx), %xmm1
> +	vmovdqu	%xmm0, (%rdi)
> +	vmovdqu	%xmm1, -16(%rdi, %rdx)
> +	/* No ymm registers have been touched.  */
> +	ret
> +#endif
> +
>  #if VEC_SIZE > 32
> +	.p2align 4,, 10
>  L(between_32_63):
>  	/* From 32 to 63.  No branch when size == 32.  */
>  	VMOVU	(%rsi), %YMM0
> -	VMOVU	-32(%rsi,%rdx), %YMM1
> +	VMOVU	-32(%rsi, %rdx), %YMM1
>  	VMOVU	%YMM0, (%rdi)
> -	VMOVU	%YMM1, -32(%rdi,%rdx)
> -	VZEROUPPER_RETURN
> -#endif
> -#if VEC_SIZE > 16
> -	/* From 16 to 31.  No branch when size == 16.  */
> -L(between_16_31):
> -	VMOVU	(%rsi), %XMM0
> -	VMOVU	-16(%rsi,%rdx), %XMM1
> -	VMOVU	%XMM0, (%rdi)
> -	VMOVU	%XMM1, -16(%rdi,%rdx)
> +	VMOVU	%YMM1, -32(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  #endif
> +
> +	.p2align 4,, 10
>  L(between_8_15):
>  	/* From 8 to 15.  No branch when size == 8.  */
> -	movq	-8(%rsi,%rdx), %rcx
> +	movq	-8(%rsi, %rdx), %rcx
>  	movq	(%rsi), %rsi
> -	movq	%rcx, -8(%rdi,%rdx)
>  	movq	%rsi, (%rdi)
> +	movq	%rcx, -8(%rdi, %rdx)
>  	ret
> -L(between_4_7):
> -	/* From 4 to 7.  No branch when size == 4.  */
> -	movl	-4(%rsi,%rdx), %ecx
> -	movl	(%rsi), %esi
> -	movl	%ecx, -4(%rdi,%rdx)
> -	movl	%esi, (%rdi)
> -	ret
> -L(between_2_3):
> -	/* From 2 to 3.  No branch when size == 2.  */
> -	movzwl	-2(%rsi,%rdx), %ecx
> -	movzwl	(%rsi), %esi
> -	movw	%cx, -2(%rdi,%rdx)
> -	movw	%si, (%rdi)
> -	ret
>  
> +	.p2align 4,, 10
> +L(last_4x_vec):
> +	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> +
> +	/* VEC(0) and VEC(1) have already been loaded.  */
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> +	VMOVU	%VEC(0), (%rdi)
> +	VMOVU	%VEC(1), VEC_SIZE(%rdi)
> +	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
> +	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  L(movsb_more_2x_vec):
>  	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
>  	ja	L(movsb)
>  #endif
>  L(more_2x_vec):
> -	/* More than 2 * VEC and there may be overlap between destination
> -	   and source.  */
> +	/* More than 2 * VEC and there may be overlap between
> +	   destination and source.  */
>  	cmpq	$(VEC_SIZE * 8), %rdx
>  	ja	L(more_8x_vec)
> +	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
>  	cmpq	$(VEC_SIZE * 4), %rdx
>  	jbe	L(last_4x_vec)
> -	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> -	VMOVU	(%rsi), %VEC(0)
> -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
>  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
>  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
> -	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> -	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> -	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
>  	VMOVU	%VEC(0), (%rdi)
>  	VMOVU	%VEC(1), VEC_SIZE(%rdi)
>  	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
>  	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
> -	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
> -	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> -	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> -	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> -	VZEROUPPER_RETURN
> -L(last_4x_vec):
> -	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> -	VMOVU	(%rsi), %VEC(0)
> -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
> -	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> -	VMOVU	%VEC(0), (%rdi)
> -	VMOVU	%VEC(1), VEC_SIZE(%rdi)
> -	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
> -	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> +	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
> +	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  
> +	.p2align 4,, 4
>  L(more_8x_vec):
> +	movq	%rdi, %rcx
> +	subq	%rsi, %rcx
> +	/* Go to backwards temporal copy if overlap no matter what as
> +	   backward REP MOVSB is slow and we don't want to use NT stores if
> +	   there is overlap.  */
> +	cmpq	%rdx, %rcx
> +	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +	jb	L(more_8x_vec_backward_check_nop)
>  	/* Check if non-temporal move candidate.  */
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>  	/* Check non-temporal store threshold.  */
> -	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
>  	ja	L(large_memcpy_2x)
>  #endif
> -	/* Entry if rdx is greater than non-temporal threshold but there
> -       is overlap.  */
> +	/* To reach this point there cannot be overlap and dst > src. So
> +	   check for overlap and src > dst in which case correctness
> +	   requires forward copy. Otherwise decide between backward/forward
> +	   copy depending on address aliasing.  */
> +
> +	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> +	   but less than __x86_shared_non_temporal_threshold.  */
>  L(more_8x_vec_check):
> -	cmpq	%rsi, %rdi
> -	ja	L(more_8x_vec_backward)
> -	/* Source == destination is less common.  */
> -	je	L(nop)
> -	/* Load the first VEC and last 4 * VEC to support overlapping
> -	   addresses.  */
> -	VMOVU	(%rsi), %VEC(4)
> +	/* rcx contains dst - src. Add back length (rdx).  */
> +	leaq	(%rcx, %rdx), %r8
> +	/* If r8 has different sign than rcx then there is overlap so we
> +	   must do forward copy.  */
> +	xorq	%rcx, %r8
> +	/* Isolate just sign bit of r8.  */
> +	shrq	$63, %r8
> +	/* Get 4k difference dst - src.  */
> +	andl	$(PAGE_SIZE - 256), %ecx
> +	/* If r8 is non-zero we must copy forward for correctness.
> +	   Otherwise, if ecx is zero, (dst - src) % 4096 < 256 and a
> +	   forward copy would suffer 4k false aliasing, so copy
> +	   backward instead.  */
> +	addl	%r8d, %ecx
> +	jz	L(more_8x_vec_backward)
> +
> +	/* if rdx is greater than __x86_shared_non_temporal_threshold
> +	   but there is overlap, or from short distance movsb.  */
> +L(more_8x_vec_forward):
> +	/* Load first and last 4 * VEC to support overlapping addresses.
> +	 */
> +
> +	/* First vec was already loaded into VEC(0).  */
>  	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
>  	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +	/* Save beginning of dst.  */
> +	movq	%rdi, %rcx
> +	/* Align dst to VEC_SIZE - 1.  */
> +	orq	$(VEC_SIZE - 1), %rdi
>  	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
>  	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> -	/* Save start and stop of the destination buffer.  */
> -	movq	%rdi, %r11
> -	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
> -	/* Align destination for aligned stores in the loop.  Compute
> -	   how much destination is misaligned.  */
> -	movq	%rdi, %r8
> -	andq	$(VEC_SIZE - 1), %r8
> -	/* Get the negative of offset for alignment.  */
> -	subq	$VEC_SIZE, %r8
> -	/* Adjust source.  */
> -	subq	%r8, %rsi
> -	/* Adjust destination which should be aligned now.  */
> -	subq	%r8, %rdi
> -	/* Adjust length.  */
> -	addq	%r8, %rdx
>  
> -	.p2align 4
> +	/* Subtract dst from src. Add back after dst aligned.  */
> +	subq	%rcx, %rsi
> +	/* Finish aligning dst.  */
> +	incq	%rdi
> +	/* Restore src adjusted with new value for aligned dst.  */
> +	addq	%rdi, %rsi
> +	/* Store end of buffer minus tail in rdx.  */
> +	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
> +
> +	/* Don't use multi-byte nop to align.  */
> +	.p2align 4,, 11
>  L(loop_4x_vec_forward):
>  	/* Copy 4 * VEC a time forward.  */
> -	VMOVU	(%rsi), %VEC(0)
> -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> -	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
> -	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> +	VMOVU	(%rsi), %VEC(1)
> +	VMOVU	VEC_SIZE(%rsi), %VEC(2)
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
>  	subq	$-(VEC_SIZE * 4), %rsi
> -	addq	$-(VEC_SIZE * 4), %rdx
> -	VMOVA	%VEC(0), (%rdi)
> -	VMOVA	%VEC(1), VEC_SIZE(%rdi)
> -	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
> -	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
> +	VMOVA	%VEC(1), (%rdi)
> +	VMOVA	%VEC(2), VEC_SIZE(%rdi)
> +	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
>  	subq	$-(VEC_SIZE * 4), %rdi
> -	cmpq	$(VEC_SIZE * 4), %rdx
> +	cmpq	%rdi, %rdx
>  	ja	L(loop_4x_vec_forward)
>  	/* Store the last 4 * VEC.  */
> -	VMOVU	%VEC(5), (%rcx)
> -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
> -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
> -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
> +	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
> +	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
> +	VMOVU	%VEC(7), VEC_SIZE(%rdx)
> +	VMOVU	%VEC(8), (%rdx)
>  	/* Store the first VEC.  */
> -	VMOVU	%VEC(4), (%r11)
> +	VMOVU	%VEC(0), (%rcx)
> +	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> +	 */
> +L(nop_backward):
>  	VZEROUPPER_RETURN
>  
> +	.p2align 4,, 8
> +L(more_8x_vec_backward_check_nop):
> +	/* rcx contains dst - src. Test for dst == src to skip all of
> +	   memmove.  */
> +	testq	%rcx, %rcx
> +	jz	L(nop_backward)
>  L(more_8x_vec_backward):
>  	/* Load the first 4 * VEC and last VEC to support overlapping
>  	   addresses.  */
> -	VMOVU	(%rsi), %VEC(4)
> +
> +	/* First vec was also loaded into VEC(0).  */
>  	VMOVU	VEC_SIZE(%rsi), %VEC(5)
>  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
> +	/* Beginning of region for 4x backward copy stored in rcx.  */
> +	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
>  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
> -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
> -	/* Save stop of the destination buffer.  */
> -	leaq	-VEC_SIZE(%rdi, %rdx), %r11
> -	/* Align destination end for aligned stores in the loop.  Compute
> -	   how much destination end is misaligned.  */
> -	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
> -	movq	%r11, %r9
> -	movq	%r11, %r8
> -	andq	$(VEC_SIZE - 1), %r8
> -	/* Adjust source.  */
> -	subq	%r8, %rcx
> -	/* Adjust the end of destination which should be aligned now.  */
> -	subq	%r8, %r9
> -	/* Adjust length.  */
> -	subq	%r8, %rdx
> -
> -	.p2align 4
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
> +	/* Subtract dst from src. Add back after dst aligned.  */
> +	subq	%rdi, %rsi
> +	/* Align dst.  */
> +	andq	$-(VEC_SIZE), %rcx
> +	/* Restore src.  */
> +	addq	%rcx, %rsi
> +
> +	/* Don't use multi-byte nop to align.  */
> +	.p2align 4,, 11
>  L(loop_4x_vec_backward):
>  	/* Copy 4 * VEC a time backward.  */
> -	VMOVU	(%rcx), %VEC(0)
> -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
> -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
> -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
> -	addq	$-(VEC_SIZE * 4), %rcx
> -	addq	$-(VEC_SIZE * 4), %rdx
> -	VMOVA	%VEC(0), (%r9)
> -	VMOVA	%VEC(1), -VEC_SIZE(%r9)
> -	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
> -	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
> -	addq	$-(VEC_SIZE * 4), %r9
> -	cmpq	$(VEC_SIZE * 4), %rdx
> -	ja	L(loop_4x_vec_backward)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
> +	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
> +	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
> +	addq	$(VEC_SIZE * -4), %rsi
> +	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
> +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
> +	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
> +	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
> +	addq	$(VEC_SIZE * -4), %rcx
> +	cmpq	%rcx, %rdi
> +	jb	L(loop_4x_vec_backward)
>  	/* Store the first 4 * VEC.  */
> -	VMOVU	%VEC(4), (%rdi)
> +	VMOVU	%VEC(0), (%rdi)
>  	VMOVU	%VEC(5), VEC_SIZE(%rdi)
>  	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
>  	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
>  	/* Store the last VEC.  */
> -	VMOVU	%VEC(8), (%r11)
> +	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
> +	VZEROUPPER_RETURN
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +	/* L(skip_short_movsb_check) is only used with ERMS. Not for
> +	   FSRM.  */
> +	.p2align 5,, 16
> +# if ALIGN_MOVSB
> +L(skip_short_movsb_check):
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +	/* If the CPU does not have FSRM there are two options for
> +	   aligning: align src if dst and src 4k alias, otherwise align
> +	   dst.  */
> +	testl	$(PAGE_SIZE - 512), %ecx
> +	jnz	L(movsb_align_dst)
> +	/* Fall through. dst and src 4k alias. It's better to align src
> +	   here because the bottleneck will be loads due to the false
> +	   dependency on dst.  */
> +
> +	/* rcx already has dst - src.  */
> +	movq	%rcx, %r9
> +	/* Add src to len. Subtract back after src aligned. -1 because
> +	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +	leaq	-1(%rsi, %rdx), %rcx
> +	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> +	orq	$(MOVSB_ALIGN_TO - 1), %rsi
> +	/* Restore dst and len adjusted with new values for aligned dst.
> +	 */
> +	leaq	1(%rsi, %r9), %rdi
> +	subq	%rsi, %rcx
> +	/* Finish aligning src.  */
> +	incq	%rsi
> +
> +	rep	movsb
> +
> +	VMOVU	%VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +	VMOVU	%VEC(1), VEC_SIZE(%r8)
> +#  endif
>  	VZEROUPPER_RETURN
> +# endif
> +
> +	.p2align 4,, 12
> +L(movsb):
> +	movq	%rdi, %rcx
> +	subq	%rsi, %rcx
> +	/* Go to backwards temporal copy if overlap no matter what as
> +	   backward REP MOVSB is slow and we don't want to use NT stores if
> +	   there is overlap.  */
> +	cmpq	%rdx, %rcx
> +	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +	jb	L(more_8x_vec_backward_check_nop)
> +# if ALIGN_MOVSB
> +	/* Save dest for storing aligning VECs later.  */
> +	movq	%rdi, %r8
> +# endif
> +	/* If above __x86_rep_movsb_stop_threshold it is most likely a
> +	   candidate for NT moves as well.  */
> +	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +	jae	L(large_memcpy_2x_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> +	/* Only avoid short movsb if CPU has FSRM.  */
> +	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	L(skip_short_movsb_check)
> +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	/* Avoid "rep movsb" if RCX, the distance between source and
> +	   destination, is N*4GB + [1..63] with N >= 0.  */
> +
> +	/* ecx contains dst - src. The early check for the backward copy
> +	   conditions means the only remaining slow-movsb case, src = dst +
> +	   [0, 63], corresponds to ecx in [-63, 0]. Use an unsigned
> +	   comparison with -64 to check for that case.  */
> +	cmpl	$-64, %ecx
> +	ja	L(more_8x_vec_forward)
> +#  endif
> +# endif
> +# if ALIGN_MOVSB
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +	/* Fall through means cpu has FSRM. In that case exclusively
> +	   align destination.  */
> +L(movsb_align_dst):
> +	/* Subtract dst from src. Add back after dst aligned.  */
> +	subq	%rdi, %rsi
> +	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> +	addq	$(MOVSB_ALIGN_TO - 1), %rdi
> +	/* Add dst to len. Subtract back after dst aligned.  */
> +	leaq	(%r8, %rdx), %rcx
> +	/* Finish aligning dst.  */
> +	andq	$-(MOVSB_ALIGN_TO), %rdi
> +	/* Restore src and len adjusted with new values for aligned dst.
> +	 */
> +	addq	%rdi, %rsi
> +	subq	%rdi, %rcx
> +
> +	rep	movsb
> +
> +	/* Store VECs loaded for aligning.  */
> +	VMOVU	%VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +	VMOVU	%VEC(1), VEC_SIZE(%r8)
> +#  endif
> +	VZEROUPPER_RETURN
> +# else	/* !ALIGN_MOVSB.  */
> +L(skip_short_movsb_check):
> +	mov	%RDX_LP, %RCX_LP
> +	rep	movsb
> +	ret
> +# endif
> +#endif
>  
> +	.p2align 4,, 10
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -	.p2align 4
> +L(large_memcpy_2x_check):
> +	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
> +	jb	L(more_8x_vec_check)
>  L(large_memcpy_2x):
> -	/* Compute absolute value of difference between source and
> -	   destination.  */
> -	movq	%rdi, %r9
> -	subq	%rsi, %r9
> -	movq	%r9, %r8
> -	leaq	-1(%r9), %rcx
> -	sarq	$63, %r8
> -	xorq	%r8, %r9
> -	subq	%r8, %r9
> -	/* Don't use non-temporal store if there is overlap between
> -	   destination and source since destination may be in cache when
> -	   source is loaded.  */
> -	cmpq	%r9, %rdx
> -	ja	L(more_8x_vec_check)
> +	/* To reach this point it is impossible for dst > src and
> +	   overlap. Remaining to check is src > dst and overlap. rcx
> +	   already contains dst - src. Negate rcx to get src - dst. If
> +	   length > rcx then there is overlap and forward copy is best.  */
> +	negq	%rcx
> +	cmpq	%rcx, %rdx
> +	ja	L(more_8x_vec_forward)
>  
>  	/* Cache align destination. First store the first 64 bytes then
>  	   adjust alignments.  */
> -	VMOVU	(%rsi), %VEC(8)
> -#if VEC_SIZE < 64
> -	VMOVU	VEC_SIZE(%rsi), %VEC(9)
> -#if VEC_SIZE < 32
> -	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
> -	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
> -#endif
> -#endif
> -	VMOVU	%VEC(8), (%rdi)
> -#if VEC_SIZE < 64
> -	VMOVU	%VEC(9), VEC_SIZE(%rdi)
> -#if VEC_SIZE < 32
> -	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
> -	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
> -#endif
> -#endif
> +
> +	/* First vec was also loaded into VEC(0).  */
> +# if VEC_SIZE < 64
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +#  if VEC_SIZE < 32
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> +#  endif
> +# endif
> +	VMOVU	%VEC(0), (%rdi)
> +# if VEC_SIZE < 64
> +	VMOVU	%VEC(1), VEC_SIZE(%rdi)
> +#  if VEC_SIZE < 32
> +	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
> +#  endif
> +# endif
> +
>  	/* Adjust source, destination, and size.  */
>  	movq	%rdi, %r8
>  	andq	$63, %r8
> @@ -614,9 +767,13 @@ L(large_memcpy_2x):
>  	/* Adjust length.  */
>  	addq	%r8, %rdx
>  
> -	/* Test if source and destination addresses will alias. If they do
> -	   the larger pipeline in large_memcpy_4x alleviated the
> +	/* Test if source and destination addresses will alias. If they
> +	   do the larger pipeline in large_memcpy_4x alleviated the
>  	   performance drop.  */
> +
> +	/* ecx contains -(dst - src). notl %ecx yields dst - src - 1,
> +	   which works for testing aliasing.  */
> +	notl	%ecx
>  	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
>  	jz	L(large_memcpy_4x)
>  
> @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
>  	/* ecx stores inner loop counter.  */
>  	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
>  L(loop_large_memcpy_4x_inner):
> -	/* Only one prefetch set per page as doing 4 pages give more time
> -	   for prefetcher to keep up.  */
> +	/* Only one prefetch set per page as doing 4 pages give more
> +	   time for prefetcher to keep up.  */
>  	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
>  	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>  	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> -- 
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-01  5:49 ` [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
@ 2021-11-06  2:31   ` H.J. Lu
  2021-11-06  4:39     ` Noah Goldstein
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06  2:31 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> No bug.
> 
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks, the vector copy loop, especially now that it handles 4k
> aliasing, is better in this medium size range.
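> 
> (Concretely, with the hunks below the defaults become 4096 * (64 / 16)
> = 16384 bytes with AVX512, 4096 * (32 / 16) = 8192 bytes with
> AVX_Fast_Unaligned_Load, and 4096 * (16 / 16) = 4096 bytes otherwise,
> i.e. double the previous 8192/4096/2048.)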
> 
> On Skylake with ERMS:
> 
> Size,   Align1, Align2, dst>src, (rep movsb) / (vec copy)
> 4096,   0,      0,      0,      0.975
> 4096,   0,      0,      1,      0.953
> 4096,   12,     0,      0,      0.969
> 4096,   12,     0,      1,      0.872
> 4096,   44,     0,      0,      0.979
> 4096,   44,     0,      1,      0.83
> 4096,   0,      12,     0,      1.006
> 4096,   0,      12,     1,      0.989
> 4096,   0,      44,     0,      0.739
> 4096,   0,      44,     1,      0.942
> 4096,   12,     12,     0,      1.009
> 4096,   12,     12,     1,      0.973
> 4096,   44,     44,     0,      0.791
> 4096,   44,     44,     1,      0.961
> 4096,   2048,   0,      0,      0.978
> 4096,   2048,   0,      1,      0.951
> 4096,   2060,   0,      0,      0.986
> 4096,   2060,   0,      1,      0.963
> 4096,   2048,   12,     0,      0.971
> 4096,   2048,   12,     1,      0.941
> 4096,   2060,   12,     0,      0.977
> 4096,   2060,   12,     1,      0.949
> 8192,   0,      0,      0,      0.85
> 8192,   0,      0,      1,      0.845
> 8192,   13,     0,      0,      0.937
> 8192,   13,     0,      1,      0.939
> 8192,   45,     0,      0,      0.932
> 8192,   45,     0,      1,      0.927
> 8192,   0,      13,     0,      0.621
> 8192,   0,      13,     1,      0.62
> 8192,   0,      45,     0,      0.53
> 8192,   0,      45,     1,      0.516
> 8192,   13,     13,     0,      0.664
> 8192,   13,     13,     1,      0.659
> 8192,   45,     45,     0,      0.593
> 8192,   45,     45,     1,      0.575
> 8192,   2048,   0,      0,      0.854
> 8192,   2048,   0,      1,      0.834
> 8192,   2061,   0,      0,      0.863
> 8192,   2061,   0,      1,      0.857
> 8192,   2048,   13,     0,      0.63
> 8192,   2048,   13,     1,      0.629
> 8192,   2061,   13,     0,      0.627
> 8192,   2061,   13,     1,      0.62
> ---
>  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index e6c94dfd02..712b7c7fd0 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
>      {
> -      rep_movsb_threshold = 2048 * (64 / 16);
> +      rep_movsb_threshold = 4096 * (64 / 16);

Please also update the default of x86_rep_movsb_threshold in

sysdeps/x86/dl-tunables.list

>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 64 * 8;
>  #endif
> @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
>  				    AVX_Fast_Unaligned_Load))
>      {
> -      rep_movsb_threshold = 2048 * (32 / 16);
> +      rep_movsb_threshold = 4096 * (32 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 32 * 8;
>  #endif
>      }
>    else
>      {
> -      rep_movsb_threshold = 2048 * (16 / 16);
> +      rep_movsb_threshold = 4096 * (16 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 16 * 8;
>  #endif
> @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>  
> +
> +
> +

Please don't add these blank lines.

>    unsigned long int rep_movsb_stop_threshold;
>    /* ERMS feature is implemented from AMD Zen3 architecture and it is
>       performing poorly for data above L2 cache size. Henceforth, adding
> -- 
> 2.25.1
> 

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v2 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                   ` (4 preceding siblings ...)
  2021-11-06  2:27 ` [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
@ 2021-11-06  4:39 ` Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
                     ` (3 more replies)
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  7 siblings, 4 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to test both dst > src and dst <
src. This is because there is logic in the code based on the relative
ordering of dst and src.
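
A rough sketch (not glibc's implementation) of the kind of direction
branch these tests now exercise from both sides:

    #include <stddef.h>

    /* Sketch only: copies diverge into a forward or a backward loop
       depending on how dst and src are ordered, so both orderings need
       coverage.  */
    static void
    copy (char *dst, const char *src, size_t n)
    {
      if (dst <= src)
        for (size_t i = 0; i < n; i++)	/* forward  */
          dst[i] = src[i];
      else
        while (n--)			/* backward  */
          dst[n] = src[n];
    }
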
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
 string/test-memmove.c |  75 ++++++++++++++++++-
 2 files changed, 214 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9e965bed3..3b0f3127b7 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #ifndef MEMCPY_RESULT
+# define DO_EXTRA_TESTS
 # define MEMCPY_RESULT(dst, len) dst
 # define MIN_PAGE_SIZE 131072
 # define TEST_MAIN
@@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -212,56 +215,87 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -298,6 +332,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -306,8 +341,88 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+#ifdef DO_EXTRA_TESTS
+  for (i = 0x200000; i <= 0x2000000; i += i)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+
+          do_test1 (0, j, i + 1);
+          do_test1 (4095, j, i + 1);
+          do_test1 (4096 - j, 0, i + 1);
+
+          do_test1 (0, j - 1, i + 1);
+          do_test1 (4095, j - 1, i + 1);
+          do_test1 (4096 - j - 1, 0, i + 1);
+
+          do_test1 (0, j + 1, i + 1);
+          do_test1 (4095, j + 1, i + 1);
+          do_test1 (4096 - j, 1, i + 1);
+
+          do_test1 (0, j, i - 1);
+          do_test1 (4095, j, i - 1);
+          do_test1 (4096 - j, 0, i - 1);
+
+          do_test1 (0, j - 1, i - 1);
+          do_test1 (4095, j - 1, i - 1);
+          do_test1 (4096 - j - 1, 0, i - 1);
+
+          do_test1 (0, j + 1, i - 1);
+          do_test1 (4095, j + 1, i - 1);
+          do_test1 (4096 - j, 1, i - 1);
+        }
+    }
+#endif
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index a0ce8b0334..5c6d1579e3 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, buf, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -395,13 +440,39 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
+  do_test2 (0x200000 - 1);
   do_test2 (0x200000);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 - 1);
+  do_test2 (0x1000000);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
+  do_test2 (0x4000000 + 1);
 
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v2 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
@ 2021-11-06  4:39   ` Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: libc-alpha

This commit adds more cases to the common memcpy/memmove
benchmarks. The most significant cases are the half-page offsets. The
current version leaves dst and src near page aligned, which leads to
false 4k aliasing on x86_64. This can add noise due to false
dependencies from one run to the next. As well, this seems like more
of an edge case than a common case, so it shouldn't be the only thing
benchmarked.
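
A rough sketch (not from the patch) of the condition the half-page
offsets are meant to avoid: a load can falsely depend on an earlier
store when the two addresses land on nearly the same 4 KiB page offset.

    #include <stdint.h>

    /* Sketch only: dst and src risk 4k false aliasing when their
       distance is close to a multiple of the 4 KiB page size.  */
    static int
    may_4k_alias (const void *dst, const void *src)
    {
      uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
      uintptr_t off = diff & 0xfff;
      return off < 64 || off > 0xfff - 64;
    }
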
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
 benchtests/bench-memmove.c | 26 +++++++++++++++++---
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..744bea26d3 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +102,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +124,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,16 +141,26 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
     {
       if ((i & (i - 1)) == 0)
-	continue;
+        continue;
       do_test (&json_ctx, 0, 0, 16 * i, 1);
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +169,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..855f4d0649 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +88,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +142,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v2 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-06  4:39   ` Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
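
For example (illustrative only), the partial-overlap case ends up
timing calls of roughly this shape, with the destination starting
halfway into the source region:

    #include <string.h>

    /* Sketch only: dst = src + len / 2, so the upper half of the source
       is also the lower half of the destination.  buf must hold at
       least len + len / 2 bytes.  */
    static void
    partial_overlap_case (char *buf, size_t len)
    {
      memmove (buf + len / 2, buf, len);
    }
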
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..2fb484c0ba 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
+  char *s1, *s2, *tmp;
+  size_t repeats;
 
-  if (overlap)
-    buf2 = buf1;
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
 
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
 
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v2 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-06  4:39   ` Noah Goldstein
  2021-11-06  4:39   ` [PATCH v2 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: libc-alpha

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
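
A rough C sketch of the direction decision that point 2 describes for
the L(more_8x_vec) path (simplified; the real asm also folds in the
rep movsb and non-temporal thresholds):

    #include <stdint.h>

    /* Sketch only: once overlap is ruled out, prefer a backward copy
       when dst - src is within 256 bytes above a multiple of 4 KiB,
       since a forward copy would then suffer 4k false aliasing between
       its stores and the loads that follow them.  */
    static int
    backward_copy_preferred (uintptr_t dst, uintptr_t src, uintptr_t len)
    {
      uintptr_t diff = dst - src;	/* wraps when dst < src  */
      if (diff < len)			/* dst overlaps just above src  */
        return 1;			/* backward copy required  */
      if (src - dst < len)		/* src overlaps just above dst  */
        return 0;			/* forward copy required  */
      return (diff & (4096 - 256)) == 0;
    }
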
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/memmove.S                      |   2 +-
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
 6 files changed, 381 insertions(+), 224 deletions(-)

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f..b2b3180848 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e86..67a55f0c85 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f15..975ae6c051 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39..0fa7126830 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944..88715441fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d4..7b27cbdda5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte
+   alignment, but it is not worth loading 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backward temporal copy if there is overlap, no matter
+	   what, as backward REP MOVSB is slow and we don't want to use NT
+	   stores if there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* To reach this point, overlap with dst > src is impossible. So
+	   check for overlap with src > dst, in which case correctness
+	   requires a forward copy. Otherwise decide between backward and
+	   forward copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do forward for correctness. Otherwise
+	   if ecx is non-zero there is 4k False Aliasing so do backward
+	   copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from the short distance movsb check.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
 
-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If the CPU does not have FSRM there are two options for aligning.
+	   Align src if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backward temporal copy if there is overlap, no matter
+	   what, as backward REP MOVSB is slow and we don't want to use NT
+	   stores if there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold it is most likely a
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination.  */
-	movq	%rdi, %r9
-	subq	%rsi, %r9
-	movq	%r9, %r8
-	leaq	-1(%r9), %rcx
-	sarq	$63, %r8
-	xorq	%r8, %r9
-	subq	%r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded.  */
-	cmpq	%r9, %rdx
-	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
 
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page as doing 4 pages gives more
+	   time for the prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
-- 
2.25.1



* [PATCH v2 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2021-11-06  4:39   ` [PATCH v2 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-06  4:39   ` Noah Goldstein
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: libc-alpha

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium-ranged sizes.

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
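
For reference, the new default works out as below (a rough C sketch of
just the scaling rule visible in the hunks that follow; the helper name
is made up and is not a glibc symbol):

    static unsigned long int
    erms_rep_movsb_threshold_default (unsigned long int vec_size)
    {
      /* Previously 2048 * (vec_size / 16); this patch doubles it:
         4096 bytes for SSE2, 8192 for AVX2, 16384 for AVX-512.  */
      return 4096 * (vec_size / 16);
    }
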
---
 sysdeps/x86/dl-cacheinfo.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..b3d3ced5a7 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
-- 
2.25.1



* Re: [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06  2:31   ` H.J. Lu
@ 2021-11-06  4:39     ` Noah Goldstein
  2021-11-06 12:04       ` H.J. Lu
  0 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06  4:39 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > No bug.
> >
> > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > benchmarks the vector copy loop, especially now that it handles 4k
> > aliasing, is better for these medium ranged.
> >
> > On Skylake with ERMS:
> >
> > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > 4096,   0,      0,      0,      0.975
> > 4096,   0,      0,      1,      0.953
> > 4096,   12,     0,      0,      0.969
> > 4096,   12,     0,      1,      0.872
> > 4096,   44,     0,      0,      0.979
> > 4096,   44,     0,      1,      0.83
> > 4096,   0,      12,     0,      1.006
> > 4096,   0,      12,     1,      0.989
> > 4096,   0,      44,     0,      0.739
> > 4096,   0,      44,     1,      0.942
> > 4096,   12,     12,     0,      1.009
> > 4096,   12,     12,     1,      0.973
> > 4096,   44,     44,     0,      0.791
> > 4096,   44,     44,     1,      0.961
> > 4096,   2048,   0,      0,      0.978
> > 4096,   2048,   0,      1,      0.951
> > 4096,   2060,   0,      0,      0.986
> > 4096,   2060,   0,      1,      0.963
> > 4096,   2048,   12,     0,      0.971
> > 4096,   2048,   12,     1,      0.941
> > 4096,   2060,   12,     0,      0.977
> > 4096,   2060,   12,     1,      0.949
> > 8192,   0,      0,      0,      0.85
> > 8192,   0,      0,      1,      0.845
> > 8192,   13,     0,      0,      0.937
> > 8192,   13,     0,      1,      0.939
> > 8192,   45,     0,      0,      0.932
> > 8192,   45,     0,      1,      0.927
> > 8192,   0,      13,     0,      0.621
> > 8192,   0,      13,     1,      0.62
> > 8192,   0,      45,     0,      0.53
> > 8192,   0,      45,     1,      0.516
> > 8192,   13,     13,     0,      0.664
> > 8192,   13,     13,     1,      0.659
> > 8192,   45,     45,     0,      0.593
> > 8192,   45,     45,     1,      0.575
> > 8192,   2048,   0,      0,      0.854
> > 8192,   2048,   0,      1,      0.834
> > 8192,   2061,   0,      0,      0.863
> > 8192,   2061,   0,      1,      0.857
> > 8192,   2048,   13,     0,      0.63
> > 8192,   2048,   13,     1,      0.629
> > 8192,   2061,   13,     0,      0.627
> > 8192,   2061,   13,     1,      0.62
> > ---
> >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> >  1 file changed, 6 insertions(+), 3 deletions(-)
> >
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index e6c94dfd02..712b7c7fd0 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> >      {
> > -      rep_movsb_threshold = 2048 * (64 / 16);
> > +      rep_movsb_threshold = 4096 * (64 / 16);
>
> Please also update the default of x86_rep_stosb_threshold in

Do you know what to set it at?

I haven't tested recently but last time I checked stosb was significantly
better even for smaller values than movsb. Think it warrants another patch
as the numbers in this commit are for movsb and I don't think the two are
necessarily 1-1.

>
> sysdeps/x86/dl-tunables.list
>
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 64 * 8;
> >  #endif
> > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> >                                   AVX_Fast_Unaligned_Load))
> >      {
> > -      rep_movsb_threshold = 2048 * (32 / 16);
> > +      rep_movsb_threshold = 4096 * (32 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 32 * 8;
> >  #endif
> >      }
> >    else
> >      {
> > -      rep_movsb_threshold = 2048 * (16 / 16);
> > +      rep_movsb_threshold = 4096 * (16 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 16 * 8;
> >  #endif
> > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> >      rep_movsb_threshold = 2112;
> >
> > +
> > +
> > +
>
> Please don't add these blank lines.
Fixed.


>
> >    unsigned long int rep_movsb_stop_threshold;
> >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> >       performing poorly for data above L2 cache size. Henceforth, adding
> > --
> > 2.25.1
> >
>
> Thanks.
>
> H.J.


* Re: [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06  4:39     ` Noah Goldstein
@ 2021-11-06 12:04       ` H.J. Lu
  2021-11-06 17:38         ` Noah Goldstein
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 12:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Fri, Nov 5, 2021 at 9:39 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > > No bug.
> > >
> > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > benchmarks the vector copy loop, especially now that it handles 4k
> > > aliasing, is better for these medium ranged.
> > >
> > > On Skylake with ERMS:
> > >
> > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > 4096,   0,      0,      0,      0.975
> > > 4096,   0,      0,      1,      0.953
> > > 4096,   12,     0,      0,      0.969
> > > 4096,   12,     0,      1,      0.872
> > > 4096,   44,     0,      0,      0.979
> > > 4096,   44,     0,      1,      0.83
> > > 4096,   0,      12,     0,      1.006
> > > 4096,   0,      12,     1,      0.989
> > > 4096,   0,      44,     0,      0.739
> > > 4096,   0,      44,     1,      0.942
> > > 4096,   12,     12,     0,      1.009
> > > 4096,   12,     12,     1,      0.973
> > > 4096,   44,     44,     0,      0.791
> > > 4096,   44,     44,     1,      0.961
> > > 4096,   2048,   0,      0,      0.978
> > > 4096,   2048,   0,      1,      0.951
> > > 4096,   2060,   0,      0,      0.986
> > > 4096,   2060,   0,      1,      0.963
> > > 4096,   2048,   12,     0,      0.971
> > > 4096,   2048,   12,     1,      0.941
> > > 4096,   2060,   12,     0,      0.977
> > > 4096,   2060,   12,     1,      0.949
> > > 8192,   0,      0,      0,      0.85
> > > 8192,   0,      0,      1,      0.845
> > > 8192,   13,     0,      0,      0.937
> > > 8192,   13,     0,      1,      0.939
> > > 8192,   45,     0,      0,      0.932
> > > 8192,   45,     0,      1,      0.927
> > > 8192,   0,      13,     0,      0.621
> > > 8192,   0,      13,     1,      0.62
> > > 8192,   0,      45,     0,      0.53
> > > 8192,   0,      45,     1,      0.516
> > > 8192,   13,     13,     0,      0.664
> > > 8192,   13,     13,     1,      0.659
> > > 8192,   45,     45,     0,      0.593
> > > 8192,   45,     45,     1,      0.575
> > > 8192,   2048,   0,      0,      0.854
> > > 8192,   2048,   0,      1,      0.834
> > > 8192,   2061,   0,      0,      0.863
> > > 8192,   2061,   0,      1,      0.857
> > > 8192,   2048,   13,     0,      0.63
> > > 8192,   2048,   13,     1,      0.629
> > > 8192,   2061,   13,     0,      0.627
> > > 8192,   2061,   13,     1,      0.62
> > > ---
> > >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> > >  1 file changed, 6 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > index e6c94dfd02..712b7c7fd0 100644
> > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > +      rep_movsb_threshold = 4096 * (64 / 16);
> >
> > Please also update the default of x86_rep_stosb_threshold in
>
> Do you know what to set it at?

Oops.  I meant

    x86_rep_movsb_threshold {
      type: SIZE_T
      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
      # isn't faster on short data.  The memcpy micro benchmark in glibc
      # shows that 2KB is the approximate value above which REP MOVSB
      # becomes faster than SSE2 optimization on processors with Enhanced
      # REP MOVSB.  Since larger register size can move more data with a
      # single load and store, the threshold is higher with larger register
      # size.  Note: Since the REP MOVSB threshold must be greater than 8
      # times of vector size and the default value is 2048 * (vector size

       ^^^^^^^
      # / 16), the default value and the minimum value must be updated at
      # run-time.  NB: Don't set the default value since we can't tell if
      # the tunable value is set by user or not [BZ #27069].
      minval: 1
    }

> I haven't tested recently but last time I checked stosb was significantly
> better even for smaller values than movsb. Think it warrants another patch
> as the numbers in this commit are for movsb and I don't think the two are
> necessarily 1-1.
>
> >
> > sysdeps/x86/dl-tunables.list
> >
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 64 * 8;
> > >  #endif
> > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > >                                   AVX_Fast_Unaligned_Load))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 32 * 8;
> > >  #endif
> > >      }
> > >    else
> > >      {
> > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 16 * 8;
> > >  #endif
> > > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> > >      rep_movsb_threshold = 2112;
> > >
> > > +
> > > +
> > > +
> >
> > Please don't add these blank lines.
> Fixed.
>
>
> >
> > >    unsigned long int rep_movsb_stop_threshold;
> > >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > >       performing poorly for data above L2 cache size. Henceforth, adding
> > > --
> > > 2.25.1
> > >
> >
> > Thanks.
> >
> > H.J.



-- 
H.J.


* [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                   ` (5 preceding siblings ...)
  2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
@ 2021-11-06 17:37 ` Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
                     ` (3 more replies)
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  7 siblings, 4 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:37 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to test both dst > src and dst <
src. This is because there is logic in the code based on the relative
ordering of dst and src.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
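
The copy-in-both-directions pattern used by the updated tests below is
roughly the following (a standalone sketch with made-up names, not the
test code itself):

    #include <stddef.h>
    #include <string.h>

    /* Copy LEN bytes once in each direction between two disjoint
       regions of at least LEN bytes, so both dst > src and dst < src
       orderings are exercised.  */
    static void
    copy_both_directions (char *region_a, char *region_b, size_t len)
    {
      char *dst = region_a, *src = region_b;
      for (int pass = 0; pass < 2; ++pass)
        {
          memcpy (dst, src, len);
          char *tmp = dst;	/* Swap roles for the second pass.  */
          dst = src;
          src = tmp;
        }
    }
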
---
 string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
 string/test-memmove.c |  75 ++++++++++++++++++-
 2 files changed, 214 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9e965bed3..3b0f3127b7 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #ifndef MEMCPY_RESULT
+# define DO_EXTRA_TESTS
 # define MEMCPY_RESULT(dst, len) dst
 # define MIN_PAGE_SIZE 131072
 # define TEST_MAIN
@@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -212,56 +215,87 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -298,6 +332,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -306,8 +341,88 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+#ifdef DO_EXTRA_TESTS
+  for (i = 0x200000; i <= 0x2000000; i += i)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+
+          do_test1 (0, j, i + 1);
+          do_test1 (4095, j, i + 1);
+          do_test1 (4096 - j, 0, i + 1);
+
+          do_test1 (0, j - 1, i + 1);
+          do_test1 (4095, j - 1, i + 1);
+          do_test1 (4096 - j - 1, 0, i + 1);
+
+          do_test1 (0, j + 1, i + 1);
+          do_test1 (4095, j + 1, i + 1);
+          do_test1 (4096 - j, 1, i + 1);
+
+          do_test1 (0, j, i - 1);
+          do_test1 (4095, j, i - 1);
+          do_test1 (4096 - j, 0, i - 1);
+
+          do_test1 (0, j - 1, i - 1);
+          do_test1 (4095, j - 1, i - 1);
+          do_test1 (4096 - j - 1, 0, i - 1);
+
+          do_test1 (0, j + 1, i - 1);
+          do_test1 (4095, j + 1, i - 1);
+          do_test1 (4096 - j, 1, i - 1);
+        }
+    }
+#endif
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index a0ce8b0334..5c6d1579e3 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, buf, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -395,13 +440,39 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
+  do_test2 (0x200000 - 1);
   do_test2 (0x200000);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 - 1);
+  do_test2 (0x1000000);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
+  do_test2 (0x4000000 + 1);
 
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1



* [PATCH v3 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
@ 2021-11-06 17:37   ` Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:37 UTC (permalink / raw)
  To: libc-alpha

This commit adds more benchmarks for the common memcpy/memmove
benchmarks. The most significant cases are the half page offsets. The
current versions leave dst and src near page aligned, which leads to
false 4k aliasing on x86_64. This can add noise due to false
dependencies from one run to the next. As well, this seems like more
of an edge case than the common case, so it shouldn't be the only thing
benchmarked.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
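
For context, "false 4k aliasing" here means a load and an earlier store
whose addresses match in their low 12 bits; the CPU then speculates a
dependency that isn't real. A rough predicate for the situation the
half page offsets are meant to avoid (a sketch, not code from this
patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* True when dst and src sit at the same offset within a 4 KiB page,
       which is what near-page-aligned dst/src pairs tend to hit.  */
    static bool
    same_4k_offset (const void *dst, const void *src)
    {
      return (((uintptr_t) dst ^ (uintptr_t) src) & 0xfff) == 0;
    }
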
---
 benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
 benchtests/bench-memmove.c | 26 +++++++++++++++++---
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..744bea26d3 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +102,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +124,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,16 +141,26 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
     {
       if ((i & (i - 1)) == 0)
-	continue;
+        continue;
       do_test (&json_ctx, 0, 0, 16 * i, 1);
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +169,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..855f4d0649 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +88,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +142,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1



* [PATCH v3 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-06 17:37   ` Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:37 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
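
Concretely, the partial overlap case being added looks roughly like
this (a standalone sketch, not the benchmark code):

    #include <string.h>

    /* Shift the upper half of BUF down onto its start: dst and src
       share LEN / 2 bytes, so memmove (not memcpy) is required.  BUF
       must hold at least LEN + LEN / 2 bytes.  */
    static void
    shift_down_by_half (char *buf, size_t len)
    {
      memmove (buf, buf + len / 2, len);
    }
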
---
 benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..2fb484c0ba 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
+  char *s1, *s2, *tmp;
+  size_t repeats;
 
-  if (overlap)
-    buf2 = buf1;
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
 
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
 
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1



* [PATCH v3 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-06 17:37   ` Noah Goldstein
  2021-11-06 17:37   ` [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
  3 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:37 UTC (permalink / raw)
  To: libc-alpha

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
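
A C-level model of the direction choice that change (2) introduces for
the 8x-VEC path (a simplification of the assembly below, not the
implementation; it omits the src == dst no-op and the rep movsb / NT
store thresholds):

    #include <stddef.h>
    #include <stdint.h>

    /* Overlap forces a direction; otherwise copy forward unless
       dst - src lands within 256 bytes of a multiple of 4096, where a
       forward copy's loads would falsely alias its own stores.  */
    static int
    prefer_backward_copy (const char *dst, const char *src, size_t len)
    {
      uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
      if (diff < len)
        return 1;	/* dst overlaps just above src: must go backward.  */
      if ((uintptr_t) src - (uintptr_t) dst < len)
        return 0;	/* src overlaps just above dst: must go forward.  */
      return (diff & (4096 - 256)) == 0;
    }
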
---
 sysdeps/x86_64/memmove.S                      |   2 +-
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
 6 files changed, 381 insertions(+), 224 deletions(-)

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f..b2b3180848 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e86..67a55f0c85 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f15..975ae6c051 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39..0fa7126830 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944..88715441fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d4..7b27cbdda5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 
+/* Whether to align before movsb.  Ultimately we want 64 byte
+   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backwards temporal copy whenever there is overlap, as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* To reach this point there cannot be overlap with dst > src.  So
+	   check for overlap with src > dst, in which case correctness
+	   requires a forward copy.  Otherwise decide between the backward
+	   and forward copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero we must copy forward for correctness.
+	   Otherwise copy backward only when ecx is zero (dst and src 4k
+	   alias, so the forward loop would hit false dependencies).  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* Entered if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from the short distance movsb check.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
 
-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If the CPU does not have FSRM there are two options for aligning.
+	   Align src if dst and src 4k alias.  Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len, adjusted with the new value for the
+	   aligned src.  */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backwards temporal copy whenever there is overlap, as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold it is most likely a
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src.  The earlier backward copy checks mean
+	   the only remaining slow movsb case, src = dst + [0, 63], shows
+	   up as ecx in [-63, 0].  Use an unsigned comparison against -64
+	   to check for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means the CPU has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination.  */
-	movq	%rdi, %r9
-	subq	%rsi, %r9
-	movq	%r9, %r8
-	leaq	-1(%r9), %rcx
-	sarq	$63, %r8
-	xorq	%r8, %r9
-	subq	%r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded.  */
-	cmpq	%r9, %rdx
-	ja	L(more_8x_vec_check)
+	/* To reach this point overlap with dst > src is impossible.  What
+	   remains to check is overlap with src > dst.  rcx already contains
+	   dst - src; negate it to get src - dst.  If length > rcx there is
+	   overlap and the forward (temporal) copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
 
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviates the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src).  Taking its bitwise NOT yields
+	   dst - src - 1, which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page, as doing 4 pages gives more
+	   time for the prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
-- 
2.25.1
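
As an aside on the branchless small-size cases above (L(between_4_7),
L(between_8_15), ...): they all use the same overlapping head/tail
trick.  In C the 4-to-7 byte case looks roughly like the helper below;
the name is illustrative and the helper is not part of the patch:

  #include <stdint.h>
  #include <string.h>

  /* Copy any length in [4, 7] with two possibly overlapping 4-byte
     moves and no branch on the exact length.  Both loads happen before
     either store, so the trick is also memmove-safe.  */
  static void
  copy_4_to_7 (char *dst, const char *src, size_t len)
  {
    uint32_t head, tail;
    memcpy (&head, src, sizeof head);
    memcpy (&tail, src + len - 4, sizeof tail);
    memcpy (dst, &head, sizeof head);
    memcpy (dst + len - 4, &tail, sizeof tail);
  }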


^ permalink raw reply	[flat|nested] 46+ messages in thread
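
The ALIGN_MOVSB / L(movsb_align_dst) path in the patch above boils down
to: load the unaligned head with vector loads, run rep movsb on a
64-byte aligned destination, then store the saved head last.  Below is
a rough C sketch under the same assumptions as the assembly (length
well above the movsb threshold and, to keep plain memcpy legal, no
overlap); the helper name is illustrative:

  #include <stdint.h>
  #include <string.h>

  #define MOVSB_ALIGN_TO 64

  static void
  movsb_align_dst_sketch (char *dst, const char *src, size_t len)
  {
    /* Load the head before any store can clobber it (VEC(0)/VEC(1) in
       the assembly).  */
    char head[MOVSB_ALIGN_TO];
    memcpy (head, src, sizeof head);

    /* Round dst up to the next 64-byte boundary and adjust src/len.  */
    char *dst_aligned = (char *) (((uintptr_t) dst + MOVSB_ALIGN_TO - 1)
				  & ~(uintptr_t) (MOVSB_ALIGN_TO - 1));
    size_t skip = (size_t) (dst_aligned - dst);

    /* Stands in for the rep movsb on the aligned remainder.  */
    memcpy (dst_aligned, src + skip, len - skip);

    /* Store the saved head last, over the original unaligned start.  */
    memcpy (dst, head, sizeof head);
  }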

* [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                     ` (2 preceding siblings ...)
  2021-11-06 17:37   ` [PATCH v3 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-06 17:37   ` Noah Goldstein
  2021-11-06 17:56     ` H.J. Lu
  3 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:37 UTC (permalink / raw)
  To: libc-alpha

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for copies in this medium size range.

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
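
For reference, the new default works out to:

  VEC_SIZE 16 (SSE2):    4096 * (16 / 16) =  4096 bytes
  VEC_SIZE 32 (AVX):     4096 * (32 / 16) =  8192 bytes
  VEC_SIZE 64 (AVX-512): 4096 * (64 / 16) = 16384 bytes

The separate FSRM setting in dl_init_cacheinfo is not changed by this
patch.
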
---
 sysdeps/x86/dl-cacheinfo.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..ceb3b53828 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 12:04       ` H.J. Lu
@ 2021-11-06 17:38         ` Noah Goldstein
  0 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 17:38 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 7:05 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 5, 2021 at 9:39 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > > > No bug.
> > > >
> > > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > > benchmarks the vector copy loop, especially now that it handles 4k
> > > > aliasing, is better for these medium ranged.
> > > >
> > > > On Skylake with ERMS:
> > > >
> > > > [... benchmark table snipped ...]
> > > > ---
> > > >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> > > >  1 file changed, 6 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > > index e6c94dfd02..712b7c7fd0 100644
> > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > > +      rep_movsb_threshold = 4096 * (64 / 16);
> > >
> > > Please also update the default of x86_rep_stosb_threshold in
> >
> > Do you know what to set it at?
>
> Oops.  I meant

ah. Fixed.

>
>     x86_rep_movsb_threshold {
>       type: SIZE_T
>       # Since there is overhead to set up REP MOVSB operation, REP MOVSB
>       # isn't faster on short data.  The memcpy micro benchmark in glibc
>       # shows that 2KB is the approximate value above which REP MOVSB
>       # becomes faster than SSE2 optimization on processors with Enhanced
>       # REP MOVSB.  Since larger register size can move more data with a
>       # single load and store, the threshold is higher with larger register
>       # size.  Note: Since the REP MOVSB threshold must be greater than 8
>       # times of vector size and the default value is 2048 * (vector size
>
>        ^^^^^^^
>       # / 16), the default value and the minimum value must be updated at
>       # run-time.  NB: Don't set the default value since we can't tell if
>       # the tunable value is set by user or not [BZ #27069].
>       minval: 1
>     }
>
> > I haven't tested recently but last time I checked stosb was significantly
> > better even for smaller values than movsb. Think it warrants another patch
> > as the numbers in this commit are for movsb and I don't think the two are
> > necessarily 1-1.
> >
> > >
> > > sysdeps/x86/dl-tunables.list
> > >
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 64 * 8;
> > > >  #endif
> > > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > > >                                   AVX_Fast_Unaligned_Load))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 32 * 8;
> > > >  #endif
> > > >      }
> > > >    else
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 16 * 8;
> > > >  #endif
> > > > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> > > >      rep_movsb_threshold = 2112;
> > > >
> > > > +
> > > > +
> > > > +
> > >
> > > Please don't add these blank lines.
> > Fixed.
> >
> >
> > >
> > > >    unsigned long int rep_movsb_stop_threshold;
> > > >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > > >       performing poorly for data above L2 cache size. Henceforth, adding
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > Thanks.
> > >
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 17:37   ` [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
@ 2021-11-06 17:56     ` H.J. Lu
  2021-11-06 18:11       ` Noah Goldstein
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 17:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> No bug.
>
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks the vector copy loop, especially now that it handles 4k
> aliasing, is better for these medium ranged.
>
> On Skylake with ERMS:
>
> [... quoted benchmark table and patch snipped ...]
>

You need to update comments for x86_rep_movsb_threshold
in sysdeps/x86/dl-tunables.list

-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 17:56     ` H.J. Lu
@ 2021-11-06 18:11       ` Noah Goldstein
  2021-11-06 18:21         ` H.J. Lu
  0 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:11 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > No bug.
> >
> > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > benchmarks the vector copy loop, especially now that it handles 4k
> > aliasing, is better for these medium ranged.
> >
> > On Skylake with ERMS:
> >
> > [... quoted benchmark table and patch snipped ...]
> >
>
> You need to update comments for x86_rep_movsb_threshold
> in sysdeps/x86/dl-tunables.list

Can do.

Noticing that the original values were based on comparisons with SSE2 likely on
SnB or IVB. I don't have any indication that the 2048 value is not
optimal for those
processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 18:11       ` Noah Goldstein
@ 2021-11-06 18:21         ` H.J. Lu
  2021-11-06 18:34           ` Noah Goldstein
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 18:21 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > No bug.
> > >
> > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > benchmarks the vector copy loop, especially now that it handles 4k
> > > aliasing, is better for these medium ranged.
> > >
> > > On Skylake with ERMS:
> > >
> > > [... quoted benchmark table and patch snipped ...]
> > >
> >
> > You need to update comments for x86_rep_movsb_threshold
> > in sysdeps/x86/dl-tunables.list
>
> Can do.
>
> Noticing that the original values were based on comparisons with SSE2 likely on
> SnB or IVB. I don't have any indication that the 2048 value is not
> optimal for those
> processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?

Good idea.   So change the threshold to 2048 * (VEC_SIZE / 16) *
(VEC_SIZE / 16)?

-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread
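
For comparison, the formula suggested above, 2048 * (VEC_SIZE / 16) *
(VEC_SIZE / 16), works out to:

  VEC_SIZE 16 (SSE2):    2048 * 1 * 1 =  2048   (flat doubling:  4096)
  VEC_SIZE 32 (AVX):     2048 * 2 * 2 =  8192   (flat doubling:  8192)
  VEC_SIZE 64 (AVX-512): 2048 * 4 * 4 = 32768   (flat doubling: 16384)

i.e. it keeps the SSE2 default at the old 2048 while scaling the wider
vector sizes at least as much as the plain doubling.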

* [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-01  5:49 [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                   ` (6 preceding siblings ...)
  2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
@ 2021-11-06 18:33 ` Noah Goldstein
  2021-11-06 18:33   ` [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
                     ` (5 more replies)
  7 siblings, 6 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:33 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to test both dst > src and dst <
src. This is because there is logic in the code based on the
relative ordering of dst and src.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
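
For reference, the reworked do_test1 below lays out its mapping as
(offsets from large_buf; region_size is size rounded up to page_size,
align1 and align2 end up in [1, page_size]):

  [0, region_size + page_size)
      first copy region, dest = large_buf + align1
  [region_size + page_size, region_size + 2 * page_size)
      PROT_NONE guard page
  [region_size + 2 * page_size, mmap_size)
      second copy region, src = large_buf + region_size + 2 * page_size
      + align2

The second repeat swaps dest and src across the guard page so both copy
directions are exercised.
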
---
 string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
 string/test-memmove.c |  75 ++++++++++++++++++-
 2 files changed, 214 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9e965bed3..3b0f3127b7 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #ifndef MEMCPY_RESULT
+# define DO_EXTRA_TESTS
 # define MEMCPY_RESULT(dst, len) dst
 # define MIN_PAGE_SIZE 131072
 # define TEST_MAIN
@@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -212,56 +215,87 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -298,6 +332,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -306,8 +341,88 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+#ifdef DO_EXTRA_TESTS
+  for (i = 0x200000; i <= 0x2000000; i += i)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+
+          do_test1 (0, j, i + 1);
+          do_test1 (4095, j, i + 1);
+          do_test1 (4096 - j, 0, i + 1);
+
+          do_test1 (0, j - 1, i + 1);
+          do_test1 (4095, j - 1, i + 1);
+          do_test1 (4096 - j - 1, 0, i + 1);
+
+          do_test1 (0, j + 1, i + 1);
+          do_test1 (4095, j + 1, i + 1);
+          do_test1 (4096 - j, 1, i + 1);
+
+          do_test1 (0, j, i - 1);
+          do_test1 (4095, j, i - 1);
+          do_test1 (4096 - j, 0, i - 1);
+
+          do_test1 (0, j - 1, i - 1);
+          do_test1 (4095, j - 1, i - 1);
+          do_test1 (4096 - j - 1, 0, i - 1);
+
+          do_test1 (0, j + 1, i - 1);
+          do_test1 (4095, j + 1, i - 1);
+          do_test1 (4096 - j, 1, i - 1);
+        }
+    }
+#endif
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index a0ce8b0334..5c6d1579e3 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, buf, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -395,13 +440,39 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
+  do_test2 (0x200000 - 1);
   do_test2 (0x200000);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 - 1);
+  do_test2 (0x1000000);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
+  do_test2 (0x4000000 + 1);
 
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
@ 2021-11-06 18:33   ` Noah Goldstein
  2021-11-06 19:12     ` H.J. Lu
  2021-11-06 18:33   ` [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:33 UTC (permalink / raw)
  To: libc-alpha

This commit adds more cases to the common memcpy/memmove
benchmarks. The most significant additions are the half-page offsets.
The current versions leave dst and src near page aligned, which leads
to false 4k aliasing on x86_64. This can add noise due to false
dependencies from one run to the next. As well, this seems like more
of an edge case than the common case, so it shouldn't be the only thing
benchmarked.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
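
(Illustrative only, not part of the patch: a minimal standalone C
sketch of the condition the half-page offsets avoid. Two addresses
whose low 12 bits match can incur false 4k aliasing between the store
and load streams on x86_64; shifting one side by half a page breaks
that. Buffer names and sizes here are made up for the example.)

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Non-zero if dst and src have the same offset within a 4 KiB page,
   i.e. their low 12 bits match -- the false 4k-aliasing case.  */
static int
aliases_4k (const void *dst, const void *src)
{
  return (((uintptr_t) dst ^ (uintptr_t) src) & 0xfff) == 0;
}

int
main (void)
{
  size_t page = 4096, half_page = page / 2;
  char *src = aligned_alloc (page, 4 * page);
  char *dst = aligned_alloc (page, 4 * page);
  if (src == NULL || dst == NULL)
    return 1;

  printf ("page-aligned dst/src alias: %d\n", aliases_4k (dst, src));
  printf ("dst + half_page aliases:    %d\n",
          aliases_4k (dst + half_page, src));

  free (src);
  free (dst);
  return 0;
}
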
---
 benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
 benchtests/bench-memmove.c | 26 +++++++++++++++++---
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..744bea26d3 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +102,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +124,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,16 +141,26 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
     {
       if ((i & (i - 1)) == 0)
-	continue;
+        continue;
       do_test (&json_ctx, 0, 0, 16 * i, 1);
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +169,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..855f4d0649 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
-
+  for (i = 0; i < iters / 64; ++i)
+    {
+      CALL (impl, dst, src, len);
+    }
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
@@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +88,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +142,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  2021-11-06 18:33   ` [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-06 18:33   ` Noah Goldstein
  2021-11-06 19:11     ` H.J. Lu
  2021-11-06 18:33   ` [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:33 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
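
(For reference, not from the patch: a tiny standalone C example of the
partial-overlap pattern the new benchmark exercises, where dst overlaps
the tail of src.)

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[16] = "abcdefgh";

  /* dst = buf + 4 overlaps the tail of src = buf.  memmove must handle
     this as if copying through a temporary; memcpy would be undefined.  */
  memmove (buf + 4, buf, 8);

  printf ("%.12s\n", buf);	/* prints "abcdabcdefgh" */
  return 0;
}
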
---
 benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..2fb484c0ba 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
+  char *s1, *s2, *tmp;
+  size_t repeats;
 
-  if (overlap)
-    buf2 = buf1;
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
 
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
 
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
  2021-11-06 18:33   ` [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
  2021-11-06 18:33   ` [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-06 18:33   ` Noah Goldstein
  2021-11-06 19:11     ` H.J. Lu
  2021-11-06 18:33   ` [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:33 UTC (permalink / raw)
  To: libc-alpha

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
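
(A rough standalone C sketch of the direction choice described in (2);
it approximates the idea rather than mirroring the assembly, and the
helper name copy_forward_p plus the example buffers are hypothetical.)

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Decide whether a large copy should go forward or backward.  Overlap
   dictates the direction for correctness; otherwise avoid a forward
   copy when dst - src modulo 4 KiB is small (false 4k aliasing).  */
static int
copy_forward_p (char *dst, const char *src, size_t len)
{
  ptrdiff_t diff = dst - src;

  /* src > dst with overlap: forward copy required for correctness.  */
  if (diff < 0 && (size_t) -diff < len)
    return 1;

  /* dst > src with overlap: the patch sends this to the backward
     temporal path.  */
  if (diff > 0 && (size_t) diff < len)
    return 0;

  /* No overlap: if dst - src modulo 4 KiB is below 256, a forward copy
     would suffer false 4k aliasing, so prefer backward.  */
  if (((size_t) diff & (PAGE_SIZE - 256)) == 0)
    return 0;

  return 1;
}

int
main (void)
{
  static char buf[4 * PAGE_SIZE];

  printf ("%d\n", copy_forward_p (buf, buf + 64, 256));	/* 1: overlap, src > dst */
  printf ("%d\n", copy_forward_p (buf + 2 * PAGE_SIZE, buf, 1024));	/* 0: 4k alias */
  printf ("%d\n", copy_forward_p (buf + 2 * PAGE_SIZE + 512, buf, 1024)); /* 1 */
  return 0;
}
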
---
 sysdeps/x86_64/memmove.S                      |   2 +-
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
 6 files changed, 381 insertions(+), 224 deletions(-)

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f..b2b3180848 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e86..67a55f0c85 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f15..975ae6c051 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39..0fa7126830 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944..88715441fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d4..7b27cbdda5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do forward for correctness. Otherwise
+	   if ecx is non-zero there is 4k False Aliasing so do backward
+	   copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
 
-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely a
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination.  */
-	movq	%rdi, %r9
-	subq	%rsi, %r9
-	movq	%r9, %r8
-	leaq	-1(%r9), %rcx
-	sarq	$63, %r8
-	xorq	%r8, %r9
-	subq	%r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded.  */
-	cmpq	%r9, %rdx
-	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
 
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page as doing 4 pages give more
+	   time for prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                     ` (2 preceding siblings ...)
  2021-11-06 18:33   ` [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-06 18:33   ` Noah Goldstein
  2021-11-06 19:10     ` H.J. Lu
  2021-11-06 19:12   ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
  2021-12-07 21:10   ` Stafford Horne
  5 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:33 UTC (permalink / raw)
  To: libc-alpha

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks, the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium-ranged copies.

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
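
(A quick arithmetic check of the resulting defaults, based on the
dl-cacheinfo.h hunk below; the VEC_SIZE == 16 value comes from the
updated comment rather than a hunk shown here, so treat it as assumed.)

#include <stdio.h>

int
main (void)
{
  /* Default rep_movsb_threshold after this patch.  */
  printf ("AVX512 (VEC_SIZE 64): %u\n", 4096u * (64 / 16));	/* 16384 = 16 KiB */
  printf ("AVX    (VEC_SIZE 32): %u\n", 4096u * (32 / 16));	/*  8192 =  8 KiB */
  printf ("SSE2   (VEC_SIZE 16): %u\n", 2048u * (16 / 16));	/*  2048 =  2 KiB */
  return 0;
}
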
---
 sysdeps/x86/dl-cacheinfo.h   |  8 +++++---
 sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..2e43e67e4f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c9..419313804d 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
-      # shows that 2KB is the approximate value above which REP MOVSB
-      # becomes faster than SSE2 optimization on processors with Enhanced
-      # REP MOVSB.  Since larger register size can move more data with a
-      # single load and store, the threshold is higher with larger register
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
-      # times of vector size and the default value is 2048 * (vector size
-      # / 16), the default value and the minimum value must be updated at
-      # run-time.  NB: Don't set the default value since we can't tell if
-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 18:21         ` H.J. Lu
@ 2021-11-06 18:34           ` Noah Goldstein
  0 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 18:34 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Sat, Nov 6, 2021 at 1:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > No bug.
> > > >
> > > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > > benchmarks the vector copy loop, especially now that it handles 4k
> > > > aliasing, is better for these medium ranged.
> > > >
> > > > On Skylake with ERMS:
> > > >
> > > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > > 4096,   0,      0,      0,      0.975
> > > > 4096,   0,      0,      1,      0.953
> > > > 4096,   12,     0,      0,      0.969
> > > > 4096,   12,     0,      1,      0.872
> > > > 4096,   44,     0,      0,      0.979
> > > > 4096,   44,     0,      1,      0.83
> > > > 4096,   0,      12,     0,      1.006
> > > > 4096,   0,      12,     1,      0.989
> > > > 4096,   0,      44,     0,      0.739
> > > > 4096,   0,      44,     1,      0.942
> > > > 4096,   12,     12,     0,      1.009
> > > > 4096,   12,     12,     1,      0.973
> > > > 4096,   44,     44,     0,      0.791
> > > > 4096,   44,     44,     1,      0.961
> > > > 4096,   2048,   0,      0,      0.978
> > > > 4096,   2048,   0,      1,      0.951
> > > > 4096,   2060,   0,      0,      0.986
> > > > 4096,   2060,   0,      1,      0.963
> > > > 4096,   2048,   12,     0,      0.971
> > > > 4096,   2048,   12,     1,      0.941
> > > > 4096,   2060,   12,     0,      0.977
> > > > 4096,   2060,   12,     1,      0.949
> > > > 8192,   0,      0,      0,      0.85
> > > > 8192,   0,      0,      1,      0.845
> > > > 8192,   13,     0,      0,      0.937
> > > > 8192,   13,     0,      1,      0.939
> > > > 8192,   45,     0,      0,      0.932
> > > > 8192,   45,     0,      1,      0.927
> > > > 8192,   0,      13,     0,      0.621
> > > > 8192,   0,      13,     1,      0.62
> > > > 8192,   0,      45,     0,      0.53
> > > > 8192,   0,      45,     1,      0.516
> > > > 8192,   13,     13,     0,      0.664
> > > > 8192,   13,     13,     1,      0.659
> > > > 8192,   45,     45,     0,      0.593
> > > > 8192,   45,     45,     1,      0.575
> > > > 8192,   2048,   0,      0,      0.854
> > > > 8192,   2048,   0,      1,      0.834
> > > > 8192,   2061,   0,      0,      0.863
> > > > 8192,   2061,   0,      1,      0.857
> > > > 8192,   2048,   13,     0,      0.63
> > > > 8192,   2048,   13,     1,      0.629
> > > > 8192,   2061,   13,     0,      0.627
> > > > 8192,   2061,   13,     1,      0.62
> > > > ---
> > > >  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
> > > >  1 file changed, 4 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > > index e6c94dfd02..ceb3b53828 100644
> > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> > > >    unsigned int minimum_rep_movsb_threshold;
> > > >  #endif
> > > > -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> > > > +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
> > > >    unsigned int rep_movsb_threshold;
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > > +      rep_movsb_threshold = 4096 * (64 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 64 * 8;
> > > >  #endif
> > > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > > >                                     AVX_Fast_Unaligned_Load))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 32 * 8;
> > > >  #endif
> > > >      }
> > > >    else
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 16 * 8;
> > > >  #endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > You need to update comments for x86_rep_movsb_threshold
> > > in sysdeps/x86/dl-tunables.list
> >
> > Can do.
> >
> > Noticing that the original values were based on comparisons with SSE2 likely on
> > SnB or IVB. I don't have any indication that the 2048 value is not
> > optimal for those
> > processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?
>
> Good idea.   So change the threshold to 2048 * (VEC_SIZE / 16) *
> (VEC_SIZE / 16)?

Done and updated the comments explaining the thresholds.
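
(Plain arithmetic on the formula quoted above, for reference; whether
this exact form is what finally landed is not shown in this excerpt.)

#include <stdio.h>

int
main (void)
{
  /* 2048 * (VEC_SIZE / 16) * (VEC_SIZE / 16) for each vector size.  */
  printf ("VEC_SIZE 16: %u\n", 2048u * (16 / 16) * (16 / 16));	/*  2048 */
  printf ("VEC_SIZE 32: %u\n", 2048u * (32 / 16) * (32 / 16));	/*  8192 */
  printf ("VEC_SIZE 64: %u\n", 2048u * (64 / 16) * (64 / 16));	/* 32768 */
  return 0;
}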

>
> --
> H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 18:33   ` [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
@ 2021-11-06 19:10     ` H.J. Lu
  2022-04-23  1:42       ` Sunil Pandey
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 19:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Sat, Nov 6, 2021 at 11:36 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> No bug.
>
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks the vector copy loop, especially now that it handles 4k
> aliasing, is better for these medium ranged.
>
> On Skylake with ERMS:
>
> Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> 4096,   0,      0,      0,      0.975
> 4096,   0,      0,      1,      0.953
> 4096,   12,     0,      0,      0.969
> 4096,   12,     0,      1,      0.872
> 4096,   44,     0,      0,      0.979
> 4096,   44,     0,      1,      0.83
> 4096,   0,      12,     0,      1.006
> 4096,   0,      12,     1,      0.989
> 4096,   0,      44,     0,      0.739
> 4096,   0,      44,     1,      0.942
> 4096,   12,     12,     0,      1.009
> 4096,   12,     12,     1,      0.973
> 4096,   44,     44,     0,      0.791
> 4096,   44,     44,     1,      0.961
> 4096,   2048,   0,      0,      0.978
> 4096,   2048,   0,      1,      0.951
> 4096,   2060,   0,      0,      0.986
> 4096,   2060,   0,      1,      0.963
> 4096,   2048,   12,     0,      0.971
> 4096,   2048,   12,     1,      0.941
> 4096,   2060,   12,     0,      0.977
> 4096,   2060,   12,     1,      0.949
> 8192,   0,      0,      0,      0.85
> 8192,   0,      0,      1,      0.845
> 8192,   13,     0,      0,      0.937
> 8192,   13,     0,      1,      0.939
> 8192,   45,     0,      0,      0.932
> 8192,   45,     0,      1,      0.927
> 8192,   0,      13,     0,      0.621
> 8192,   0,      13,     1,      0.62
> 8192,   0,      45,     0,      0.53
> 8192,   0,      45,     1,      0.516
> 8192,   13,     13,     0,      0.664
> 8192,   13,     13,     1,      0.659
> 8192,   45,     45,     0,      0.593
> 8192,   45,     45,     1,      0.575
> 8192,   2048,   0,      0,      0.854
> 8192,   2048,   0,      1,      0.834
> 8192,   2061,   0,      0,      0.863
> 8192,   2061,   0,      1,      0.857
> 8192,   2048,   13,     0,      0.63
> 8192,   2048,   13,     1,      0.629
> 8192,   2061,   13,     0,      0.627
> 8192,   2061,   13,     1,      0.62
> ---
>  sysdeps/x86/dl-cacheinfo.h   |  8 +++++---
>  sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
>  2 files changed, 20 insertions(+), 14 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index e6c94dfd02..2e43e67e4f 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
>    unsigned int minimum_rep_movsb_threshold;
>  #endif
> -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
> +     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
> +     threshold is 2048 * (VEC_SIZE / 16).  */
>    unsigned int rep_movsb_threshold;
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
>      {
> -      rep_movsb_threshold = 2048 * (64 / 16);
> +      rep_movsb_threshold = 4096 * (64 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 64 * 8;
>  #endif
> @@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
>                                     AVX_Fast_Unaligned_Load))
>      {
> -      rep_movsb_threshold = 2048 * (32 / 16);
> +      rep_movsb_threshold = 4096 * (32 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 32 * 8;
>  #endif
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index dd6e1d65c9..419313804d 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -32,17 +32,21 @@ glibc {
>      }
>      x86_rep_movsb_threshold {
>        type: SIZE_T
> -      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
> -      # isn't faster on short data.  The memcpy micro benchmark in glibc
> -      # shows that 2KB is the approximate value above which REP MOVSB
> -      # becomes faster than SSE2 optimization on processors with Enhanced
> -      # REP MOVSB.  Since larger register size can move more data with a
> -      # single load and store, the threshold is higher with larger register
> -      # size.  Note: Since the REP MOVSB threshold must be greater than 8
> -      # times of vector size and the default value is 2048 * (vector size
> -      # / 16), the default value and the minimum value must be updated at
> -      # run-time.  NB: Don't set the default value since we can't tell if
> -      # the tunable value is set by user or not [BZ #27069].
> +      # Since there is overhead to set up REP MOVSB operation, REP
> +      # MOVSB isn't faster on short data.  The memcpy micro benchmark
> +      # in glibc shows that 2KB is the approximate value above which
> +      # REP MOVSB becomes faster than SSE2 optimization on processors
> +      # with Enhanced REP MOVSB.  Since larger register size can move
> +      # more data with a single load and store, the threshold is
> +      # higher with larger register size.  Micro benchmarks show AVX
> +      # REP MOVSB becomes faster at approximately 8KB.  The AVX512
> +      # threshold is extrapolated to 16KB.  For machines with FSRM the
> +      # threshold is universally set at 2112 bytes.  Note: Since the
> +      # REP MOVSB threshold must be greater than 8 times the vector
> +      # size and the default value is 4096 * (vector size / 16), the
> +      # default value and the minimum value must be updated at
> +      # run-time.  NB: Don't set the default value since we can't tell
> +      # if the tunable value is set by user or not [BZ #27069].
>        minval: 1
>      }
>      x86_rep_stosb_threshold {
> --
> 2.25.1
>
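
For reference, the resulting defaults can be summarized in a small C
sketch (this is not the dl-cacheinfo.h code itself; the helper name is
made up for illustration):

    #include <stddef.h>

    /* Default REP MOVSB threshold after this patch: 4096 * (VEC_SIZE / 16)
       for the AVX512 (64-byte) and AVX (32-byte) paths, 2048 * (VEC_SIZE / 16)
       for the SSE2 (16-byte) path.  The result must also stay above the
       8 * VEC_SIZE minimum that dl-cacheinfo.h enforces for the tunable.  */
    static size_t
    default_rep_movsb_threshold (unsigned int vec_size)
    {
      size_t threshold = (size_t) (vec_size >= 32 ? 4096 : 2048)
                         * (vec_size / 16);
      size_t minimum = (size_t) vec_size * 8;
      return threshold > minimum ? threshold : minimum;
    }

This gives 2KB for SSE2 (VEC_SIZE == 16), 8KB for AVX (VEC_SIZE == 32)
and 16KB for AVX512 (VEC_SIZE == 64), matching the numbers quoted in
the tunable comment above.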

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-06 18:33   ` [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-11-06 19:11     ` H.J. Lu
  2022-04-23  1:41       ` Sunil Pandey
  0 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 19:11 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> The optimizations are as follows:
>
> 1) Always align entry to 64 bytes. This makes behavior more
>    predictable and makes other frontend optimizations easier.
>
> 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
>    significant benefits in the case that:
>         0 < (dst - src) < [256, 512]
>
> 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
>    improvement and for FSRM [-10%, 25%].
>
> In addition to these primary changes there is general cleanup
> throughout to optimize the aligning routines and control flow logic.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  sysdeps/x86_64/memmove.S                      |   2 +-
>  .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
>  .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
>  .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
>  .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
>  6 files changed, 381 insertions(+), 224 deletions(-)
>
> diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> index db106a7a1f..b2b3180848 100644
> --- a/sysdeps/x86_64/memmove.S
> +++ b/sysdeps/x86_64/memmove.S
> @@ -25,7 +25,7 @@
>  /* Use movups and movaps for smaller code sizes.  */
>  #define VMOVU          movups
>  #define VMOVA          movaps
> -
> +#define MOV_SIZE       3
>  #define SECTION(p)             p
>
>  #ifdef USE_MULTIARCH
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 1ec1962e86..67a55f0c85 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT                vmovntdq
>  # define VMOVU         vmovdqu
>  # define VMOVA         vmovdqa
> -
> +# define MOV_SIZE      4
>  # define ZERO_UPPER_VEC_REGISTERS_RETURN \
>    ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index e195e93f15..975ae6c051 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT                vmovntdq
>  # define VMOVU         vmovdqu
>  # define VMOVA         vmovdqa
> -
> +# define MOV_SIZE      4
>  # define SECTION(p)            p##.avx
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 848848ab39..0fa7126830 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU         vmovdqu64
>  # define VMOVA         vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE      6
>  # define SECTION(p)            p##.evex512
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx512_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 0cbce8f944..88715441fe 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU         vmovdqu64
>  # define VMOVA         vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE      6
>  # define SECTION(p)            p##.evex
>  # define MEMMOVE_SYMBOL(p,s)   p##_evex_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index abde8438d4..7b27cbdda5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -76,6 +76,25 @@
>  # endif
>  #endif
>
> +/* Whether to align before movsb. Ultimately we want 64 byte
> +   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
> +#define ALIGN_MOVSB    (VEC_SIZE > 16)
> +/* Number of bytes to align movsb to.  */
> +#define MOVSB_ALIGN_TO 64
> +
> +#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
> +#define LARGE_MOV_SIZE (MOV_SIZE > 4)
> +
> +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> +# error MOV_SIZE Unknown
> +#endif
> +
> +#if LARGE_MOV_SIZE
> +# define SMALL_SIZE_OFFSET     (4)
> +#else
> +# define SMALL_SIZE_OFFSET     (0)
> +#endif
> +
>  #ifndef PAGE_SIZE
>  # define PAGE_SIZE 4096
>  #endif
> @@ -199,25 +218,21 @@ L(start):
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       /* Load regardless.  */
> +       VMOVU   (%rsi), %VEC(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(more_2x_vec)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(last_2x_vec):
> -#endif
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   (%rsi), %VEC(0)
>         VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> -       ret
> +#if !(defined USE_MULTIARCH && IS_IN (libc))
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
>         VZEROUPPER_RETURN
>  #endif
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> -
>  # if VEC_SIZE == 16
>  ENTRY (__mempcpy_chk_erms)
>         cmp     %RDX_LP, %RCX_LP
> @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  # endif
>
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
>         movq    %rdi, %rax
>  L(start_erms):
>  # ifdef __ILP32__
> @@ -298,310 +313,448 @@ L(start_erms):
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       /* Load regardless.  */
> +       VMOVU   (%rsi), %VEC(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
> -L(last_2x_vec):
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> +        */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>  L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
>         ret
> +# endif
>  #endif
>
> -L(movsb):
> -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> -       jae     L(more_8x_vec)
> -       cmpq    %rsi, %rdi
> -       jb      1f
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       leaq    (%rsi,%rdx), %r9
> -       cmpq    %r9, %rdi
> -       /* Avoid slow backward REP MOVSB.  */
> -       jb      L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rdi, %rcx
> -       subq    %rsi, %rcx
> -       jmp     2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rsi, %rcx
> -       subq    %rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> -   is N*4GB + [1..63] with N >= 0.  */
> -       cmpl    $63, %ecx
> -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> -3:
> -# endif
> -       mov     %RDX_LP, %RCX_LP
> -       rep movsb
> -L(nop):
> +#if LARGE_MOV_SIZE
> +       /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> +          ENTRY block and L(less_vec).  */
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    (%rsi), %ecx
> +       movl    (%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, (%rdi, %rdx)
>         ret
>  #endif
>
> +       .p2align 4
>  L(less_vec):
>         /* Less than 1 VEC.  */
>  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  # error Unsupported VEC_SIZE!
>  #endif
>  #if VEC_SIZE > 32
> -       cmpb    $32, %dl
> +       cmpl    $32, %edx
>         jae     L(between_32_63)
>  #endif
>  #if VEC_SIZE > 16
> -       cmpb    $16, %dl
> +       cmpl    $16, %edx
>         jae     L(between_16_31)
>  #endif
> -       cmpb    $8, %dl
> +       cmpl    $8, %edx
>         jae     L(between_8_15)
> -       cmpb    $4, %dl
> +#if SMALL_MOV_SIZE
> +       cmpl    $4, %edx
> +#else
> +       subq    $4, %rdx
> +#endif
>         jae     L(between_4_7)
> -       cmpb    $1, %dl
> -       ja      L(between_2_3)
> -       jb      1f
> -       movzbl  (%rsi), %ecx
> +       cmpl    $(1 - SMALL_SIZE_OFFSET), %edx
> +       jl      L(copy_0)
> +       movb    (%rsi), %cl
> +       je      L(copy_1)
> +       movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> +       movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> +L(copy_1):
>         movb    %cl, (%rdi)
> -1:
> +L(copy_0):
>         ret
> +
> +#if SMALL_MOV_SIZE
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    -4(%rsi, %rdx), %ecx
> +       movl    (%rsi), %esi
> +       movl    %ecx, -4(%rdi, %rdx)
> +       movl    %esi, (%rdi)
> +       ret
> +#endif
> +
> +#if VEC_SIZE > 16
> +       /* From 16 to 31.  No branch when size == 16.  */
> +       .p2align 4,, 8
> +L(between_16_31):
> +       vmovdqu (%rsi), %xmm0
> +       vmovdqu -16(%rsi, %rdx), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -16(%rdi, %rdx)
> +       /* No ymm registers have been touched.  */
> +       ret
> +#endif
> +
>  #if VEC_SIZE > 32
> +       .p2align 4,, 10
>  L(between_32_63):
>         /* From 32 to 63.  No branch when size == 32.  */
>         VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi,%rdx), %YMM1
> +       VMOVU   -32(%rsi, %rdx), %YMM1
>         VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -#endif
> -#if VEC_SIZE > 16
> -       /* From 16 to 31.  No branch when size == 16.  */
> -L(between_16_31):
> -       VMOVU   (%rsi), %XMM0
> -       VMOVU   -16(%rsi,%rdx), %XMM1
> -       VMOVU   %XMM0, (%rdi)
> -       VMOVU   %XMM1, -16(%rdi,%rdx)
> +       VMOVU   %YMM1, -32(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
> +
> +       .p2align 4,, 10
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
> -       movq    -8(%rsi,%rdx), %rcx
> +       movq    -8(%rsi, %rdx), %rcx
>         movq    (%rsi), %rsi
> -       movq    %rcx, -8(%rdi,%rdx)
>         movq    %rsi, (%rdi)
> +       movq    %rcx, -8(%rdi, %rdx)
>         ret
> -L(between_4_7):
> -       /* From 4 to 7.  No branch when size == 4.  */
> -       movl    -4(%rsi,%rdx), %ecx
> -       movl    (%rsi), %esi
> -       movl    %ecx, -4(%rdi,%rdx)
> -       movl    %esi, (%rdi)
> -       ret
> -L(between_2_3):
> -       /* From 2 to 3.  No branch when size == 2.  */
> -       movzwl  -2(%rsi,%rdx), %ecx
> -       movzwl  (%rsi), %esi
> -       movw    %cx, -2(%rdi,%rdx)
> -       movw    %si, (%rdi)
> -       ret
>
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> +
> +       /* VEC(0) and VEC(1) have already been loaded.  */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  L(movsb_more_2x_vec):
>         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
>         ja      L(movsb)
>  #endif
>  L(more_2x_vec):
> -       /* More than 2 * VEC and there may be overlap between destination
> -          and source.  */
> +       /* More than 2 * VEC and there may be overlap between
> +          destination and source.  */
>         cmpq    $(VEC_SIZE * 8), %rdx
>         ja      L(more_8x_vec)
> +       /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
> -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), VEC_SIZE(%rdi)
>         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -L(last_4x_vec):
> -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 4
>  L(more_8x_vec):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* If there is any overlap, go to the backward temporal copy no
> +          matter what, as backward REP MOVSB is slow and we don't want
> +          to use NT stores if there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
>         /* Check if non-temporal move candidate.  */
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>         /* Check non-temporal store threshold.  */
> -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
>         ja      L(large_memcpy_2x)
>  #endif
> -       /* Entry if rdx is greater than non-temporal threshold but there
> -       is overlap.  */
> +       /* To reach this point there cannot be overlap with dst > src.
> +          So check for overlap with src > dst, in which case correctness
> +          requires a forward copy. Otherwise decide between backward/
> +          forward copy depending on address aliasing.  */
> +
> +       /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> +          but less than __x86_shared_non_temporal_threshold.  */
>  L(more_8x_vec_check):
> -       cmpq    %rsi, %rdi
> -       ja      L(more_8x_vec_backward)
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       /* Load the first VEC and last 4 * VEC to support overlapping
> -          addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +       /* rcx contains dst - src. Add back length (rdx).  */
> +       leaq    (%rcx, %rdx), %r8
> +       /* If r8 has different sign than rcx then there is overlap so we
> +          must do forward copy.  */
> +       xorq    %rcx, %r8
> +       /* Isolate just sign bit of r8.  */
> +       shrq    $63, %r8
> +       /* Get 4k difference dst - src.  */
> +       andl    $(PAGE_SIZE - 256), %ecx
> +       /* If r8 is non-zero must do forward for correctness. Otherwise
> +          if ecx is zero dst and src 4k alias, so do a backward copy to
> +          avoid 4k false aliasing on the forward loads.  */
> +       addl    %r8d, %ecx
> +       jz      L(more_8x_vec_backward)
> +
> +       /* Entered if rdx is greater than
> +          __x86_shared_non_temporal_threshold but there is overlap, or
> +          from the short distance movsb check.  */
> +L(more_8x_vec_forward):
> +       /* Load first and last 4 * VEC to support overlapping addresses.
> +        */
> +
> +       /* First vec was already loaded into VEC(0).  */
>         VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
>         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +       /* Save beginning of dst.  */
> +       movq    %rdi, %rcx
> +       /* Align dst to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
>         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
>         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> -       /* Save start and stop of the destination buffer.  */
> -       movq    %rdi, %r11
> -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> -       /* Align destination for aligned stores in the loop.  Compute
> -          how much destination is misaligned.  */
> -       movq    %rdi, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Get the negative of offset for alignment.  */
> -       subq    $VEC_SIZE, %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rsi
> -       /* Adjust destination which should be aligned now.  */
> -       subq    %r8, %rdi
> -       /* Adjust length.  */
> -       addq    %r8, %rdx
>
> -       .p2align 4
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rcx, %rsi
> +       /* Finish aligning dst.  */
> +       incq    %rdi
> +       /* Restore src adjusted with new value for aligned dst.  */
> +       addq    %rdi, %rsi
> +       /* Store end of buffer minus tail in rdx.  */
> +       leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VEC(1), (%rdi)
> +       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (%rcx)
> -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> +       VMOVU   %VEC(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(4), (%r11)
> +       VMOVU   %VEC(0), (%rcx)
> +       /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> +        */
> +L(nop_backward):
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 8
> +L(more_8x_vec_backward_check_nop):
> +       /* rcx contains dst - src. Test for dst == src to skip all of
> +          memmove.  */
> +       testq   %rcx, %rcx
> +       jz      L(nop_backward)
>  L(more_8x_vec_backward):
>         /* Load the first 4 * VEC and last VEC to support overlapping
>            addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +
> +       /* First vec was also loaded into VEC(0).  */
>         VMOVU   VEC_SIZE(%rsi), %VEC(5)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> +       /* Beginning of region for 4x backward copy stored in rcx.  */
> +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> -       /* Save stop of the destination buffer.  */
> -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> -       /* Align destination end for aligned stores in the loop.  Compute
> -          how much destination end is misaligned.  */
> -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> -       movq    %r11, %r9
> -       movq    %r11, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rcx
> -       /* Adjust the end of destination which should be aligned now.  */
> -       subq    %r8, %r9
> -       /* Adjust length.  */
> -       subq    %r8, %rdx
> -
> -       .p2align 4
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Align dst.  */
> +       andq    $-(VEC_SIZE), %rcx
> +       /* Restore src.  */
> +       addq    %rcx, %rsi
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (%rcx), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%r9)
> -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> -       addq    $-(VEC_SIZE * 4), %r9
> -       cmpq    $(VEC_SIZE * 4), %rdx
> -       ja      L(loop_4x_vec_backward)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> +       addq    $(VEC_SIZE * -4), %rsi
> +       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> +       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> +       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> +       addq    $(VEC_SIZE * -4), %rcx
> +       cmpq    %rcx, %rdi
> +       jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(4), (%rdi)
> +       VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(5), VEC_SIZE(%rdi)
>         VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), (%r11)
> +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> +       VZEROUPPER_RETURN
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +       /* L(skip_short_movsb_check) is only used with ERMS. Not for
> +          FSRM.  */
> +       .p2align 5,, 16
> +# if ALIGN_MOVSB
> +L(skip_short_movsb_check):
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* If CPU does not have FSRM two options for aligning. Align src
> +          if dst and src 4k alias. Otherwise align dst.  */
> +       testl   $(PAGE_SIZE - 512), %ecx
> +       jnz     L(movsb_align_dst)
> +       /* Fall through. dst and src 4k alias. It's better to align src
> +          here because the bottleneck will be loads due to the false
> +          dependency on dst.  */
> +
> +       /* rcx already has dst - src.  */
> +       movq    %rcx, %r9
> +       /* Add src to len. Subtract back after src aligned. -1 because
> +          src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +       leaq    -1(%rsi, %rdx), %rcx
> +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> +       /* Restore dst and len adjusted with new values for aligned src.
> +        */
> +       leaq    1(%rsi, %r9), %rdi
> +       subq    %rsi, %rcx
> +       /* Finish aligning src.  */
> +       incq    %rsi
> +
> +       rep     movsb
> +
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
>         VZEROUPPER_RETURN
> +# endif
> +
> +       .p2align 4,, 12
> +L(movsb):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* If there is any overlap, go to the backward temporal copy no
> +          matter what, as backward REP MOVSB is slow and we don't want
> +          to use NT stores if there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
> +# if ALIGN_MOVSB
> +       /* Save dest for storing aligning VECs later.  */
> +       movq    %rdi, %r8
> +# endif
> +       /* If above __x86_rep_movsb_stop_threshold it is most likely a
> +          candidate for NT moves as well.  */
> +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +       jae     L(large_memcpy_2x_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> +       /* Only avoid short movsb if CPU has FSRM.  */
> +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      L(skip_short_movsb_check)
> +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       /* Avoid "rep movsb" if RCX, the distance between source and
> +          destination, is N*4GB + [1..63] with N >= 0.  */
> +
> +       /* ecx contains dst - src. The early check for the backward
> +          copy conditions means the only remaining slow movsb case,
> +          src = dst + [0, 63], corresponds to ecx in [-63, 0]. Use an
> +          unsigned comparison with -64 to check for that case.  */
> +       cmpl    $-64, %ecx
> +       ja      L(more_8x_vec_forward)
> +#  endif
> +# endif
> +# if ALIGN_MOVSB
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* Fall through means cpu has FSRM. In that case exclusively
> +          align destination.  */
> +L(movsb_align_dst):
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> +       /* Add dst to len. Subtract back after dst aligned.  */
> +       leaq    (%r8, %rdx), %rcx
> +       /* Finish aligning dst.  */
> +       andq    $-(MOVSB_ALIGN_TO), %rdi
> +       /* Restore src and len adjusted with new values for aligned dst.
> +        */
> +       addq    %rdi, %rsi
> +       subq    %rdi, %rcx
> +
> +       rep     movsb
> +
> +       /* Store VECs loaded for aligning.  */
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +# else /* !ALIGN_MOVSB.  */
> +L(skip_short_movsb_check):
> +       mov     %RDX_LP, %RCX_LP
> +       rep     movsb
> +       ret
> +# endif
> +#endif
>
> +       .p2align 4,, 10
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -       .p2align 4
> +L(large_memcpy_2x_check):
> +       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> +       jb      L(more_8x_vec_check)
>  L(large_memcpy_2x):
> -       /* Compute absolute value of difference between source and
> -          destination.  */
> -       movq    %rdi, %r9
> -       subq    %rsi, %r9
> -       movq    %r9, %r8
> -       leaq    -1(%r9), %rcx
> -       sarq    $63, %r8
> -       xorq    %r8, %r9
> -       subq    %r8, %r9
> -       /* Don't use non-temporal store if there is overlap between
> -          destination and source since destination may be in cache when
> -          source is loaded.  */
> -       cmpq    %r9, %rdx
> -       ja      L(more_8x_vec_check)
> +       /* To reach this point it is impossible to have overlap with
> +          dst > src. What remains to check is overlap with src > dst.
> +          rcx already contains dst - src. Negate rcx to get src - dst.
> +          If length > rcx then there is overlap and forward copy is
> +          best.  */
> +       negq    %rcx
> +       cmpq    %rcx, %rdx
> +       ja      L(more_8x_vec_forward)
>
>         /* Cache align destination. First store the first 64 bytes then
>            adjust alignments.  */
> -       VMOVU   (%rsi), %VEC(8)
> -#if VEC_SIZE < 64
> -       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> -#if VEC_SIZE < 32
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> -#endif
> -#endif
> -       VMOVU   %VEC(8), (%rdi)
> -#if VEC_SIZE < 64
> -       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> -#if VEC_SIZE < 32
> -       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> -#endif
> -#endif
> +
> +       /* First vec was also loaded into VEC(0).  */
> +# if VEC_SIZE < 64
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  if VEC_SIZE < 32
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +#  endif
> +# endif
> +       VMOVU   %VEC(0), (%rdi)
> +# if VEC_SIZE < 64
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +#  if VEC_SIZE < 32
> +       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +#  endif
> +# endif
> +
>         /* Adjust source, destination, and size.  */
>         movq    %rdi, %r8
>         andq    $63, %r8
> @@ -614,9 +767,13 @@ L(large_memcpy_2x):
>         /* Adjust length.  */
>         addq    %r8, %rdx
>
> -       /* Test if source and destination addresses will alias. If they do
> -          the larger pipeline in large_memcpy_4x alleviated the
> +       /* Test if source and destination addresses will alias. If they
> +          do the larger pipeline in large_memcpy_4x alleviates the
>            performance drop.  */
> +
> +       /* ecx contains -(dst - src). not ecx will return dst - src - 1
> +          which works for testing aliasing.  */
> +       notl    %ecx
>         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
>         jz      L(large_memcpy_4x)
>
> @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
>         /* ecx stores inner loop counter.  */
>         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
>  L(loop_large_memcpy_4x_inner):
> -       /* Only one prefetch set per page as doing 4 pages give more time
> -          for prefetcher to keep up.  */
> +       /* Only one prefetch set per page as doing 4 pages gives more
> +          time for the prefetcher to keep up.  */
>         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> --
> 2.25.1
>
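
As a reading aid for the new control flow, a rough C sketch of the
forward/backward decision made in L(more_8x_vec) and
L(more_8x_vec_check) above could look like the following (assumed
helper name; the src == dst nop and the NT-store threshold check are
omitted, so this is not the actual implementation):

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096

    /* Return nonzero if the 8x-VEC path should copy forward.  */
    static int
    copy_forward_p (uintptr_t dst, uintptr_t src, size_t len)
    {
      intptr_t diff = (intptr_t) (dst - src);
      /* Overlap with dst > src: must use the backward temporal copy.  */
      if (diff > 0 && (size_t) diff < len)
        return 0;
      /* Overlap with src > dst: correctness requires a forward copy.  */
      if (diff < 0 && (size_t) -diff <= len)
        return 1;
      /* No overlap: if dst - src is small mod 4096, a forward copy would
         suffer 4k false aliasing (loads falsely depending on earlier
         stores), so copy backward; otherwise copy forward.  */
      return (diff & (PAGE_SIZE - 256)) != 0;
    }

The alignment step added for rep movsb (L(movsb_align_dst)) can be
modelled the same way, as a sketch only, assuming len >= 64, no overlap
and with memcpy standing in both for the vector stores and for rep
movsb itself:

    #include <stdint.h>
    #include <string.h>

    #define MOVSB_ALIGN_TO 64

    static void
    aligned_rep_movsb (char *dst, const char *src, size_t len)
    {
      /* VEC(0)/VEC(1) in the assembly: the unaligned head of the copy,
         written out after the movsb in the real code.  */
      char head[MOVSB_ALIGN_TO];
      memcpy (head, src, MOVSB_ALIGN_TO);

      /* Bytes needed to round dst up to the next 64-byte boundary.  */
      size_t skip = (MOVSB_ALIGN_TO
                     - ((uintptr_t) dst & (MOVSB_ALIGN_TO - 1)))
                    & (MOVSB_ALIGN_TO - 1);

      /* Stand-in for rep movsb on the 64-byte aligned remainder.  */
      memcpy (dst + skip, src + skip, len - skip);

      /* Store the saved head last, covering the skipped bytes.  */
      memcpy (dst, head, MOVSB_ALIGN_TO);
    }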

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-11-06 18:33   ` [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
@ 2021-11-06 19:11     ` H.J. Lu
  0 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 19:11 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds a new partial overlap benchmark. This is generally
> the most interesting performance case for memmove and was missing.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  benchtests/bench-memmove-walk.c | 61 +++++++++++++++++++++++++--------
>  1 file changed, 46 insertions(+), 15 deletions(-)
>
> diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
> index b5fdb2a422..2fb484c0ba 100644
> --- a/benchtests/bench-memmove-walk.c
> +++ b/benchtests/bench-memmove-walk.c
> @@ -36,6 +36,10 @@
>  # define TIMEOUT (20 * 60)
>  # include "bench-string.h"
>
> +#define NO_OVERLAP 0
> +#define PARTIAL_OVERLAP 1
> +#define COMPLETE_OVERLAP 2
> +
>  IMPL (memmove, 1)
>  #endif
>
> @@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  }
>
>  static void
> -do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
> +do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
>  {
> -  json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) len);
> -  json_array_begin (json_ctx, "timings");
> +  char *s1, *s2, *tmp;
> +  size_t repeats;
>
> -  if (overlap)
> -    buf2 = buf1;
> +  s1 = (char *) (buf1);
> +  s2 = (char *) (buf2);
> +  if (overlap != NO_OVERLAP)
> +    s2 = s1;
> +  if (overlap == PARTIAL_OVERLAP)
> +    s2 += len / 2;
>
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> +  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_uint (json_ctx, "length", (double) len);
> +      json_attr_string(json_ctx, "overlap",
> +                       overlap == NO_OVERLAP        ? "none"
> +                       : overlap == PARTIAL_OVERLAP ? "partial"
> +                                                    : "complete");
> +      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
> +      json_array_begin (json_ctx, "timings");
> +
> +
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
>
> -  json_array_end (json_ctx);
> -  json_element_object_end (json_ctx);
> +      json_array_end (json_ctx);
> +      json_element_object_end (json_ctx);
> +
> +      tmp = s1;
> +      s1 = s2;
> +      s2 = tmp;
> +    }
>  }
>
>  int
> @@ -107,15 +131,22 @@ test_main (void)
>    /* Non-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, false);
> -      do_test (&json_ctx, i + 1, false);
> +      do_test (&json_ctx, i, NO_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
> +    }
> +
> +  /* Partially-overlapping buffers.  */
> +  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
> +    {
> +      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
>      }
>
> -  /* Overlapping buffers.  */
> +  /* Complete-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, true);
> -      do_test (&json_ctx, i + 1, true);
> +      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
> +      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
>      }
>
>    json_array_end (&json_ctx);
> --
> 2.25.1
>
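
In terms of buffer layout, the three overlap modes above boil down to
the following (sketch only, restating the setup from the patch):

    char *s1 = (char *) buf1;
    char *s2 = (char *) buf2;     /* NO_OVERLAP: two disjoint buffers.  */
    if (overlap != NO_OVERLAP)
      s2 = s1;                    /* COMPLETE_OVERLAP: same buffer.  */
    if (overlap == PARTIAL_OVERLAP)
      s2 += len / 2;              /* PARTIAL_OVERLAP: regions share half.  */

With both_ways set, a second pass swaps s1 and s2, the intent being to
time the partially overlapping copy with both dst > src and dst < src.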

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-11-06 18:33   ` [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
@ 2021-11-06 19:12     ` H.J. Lu
  0 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 19:12 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds more cases to the common memcpy/memmove
> benchmarks. The most significant cases are the half page offsets. The
> current versions leave dst and src near page aligned which leads to
> false 4k aliasing on x86_64. This can add noise due to false
> dependencies from one run to the next. As well, this seems like more
> of an edge case than the common case so it shouldn't be the only thing
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  benchtests/bench-memcpy.c  | 49 +++++++++++++++++++++++++++++++++-----
>  benchtests/bench-memmove.c | 26 +++++++++++++++++---
>  2 files changed, 66 insertions(+), 9 deletions(-)
>
> diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
> index d9236a2282..744bea26d3 100644
> --- a/benchtests/bench-memcpy.c
> +++ b/benchtests/bench-memcpy.c
> @@ -40,7 +40,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, const char *src,
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> -
> +  for (i = 0; i < iters / 64; ++i)
> +    {
> +      CALL (impl, dst, src, len);
> +    }
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> @@ -60,11 +63,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
>    size_t i, j;
>    char *s1, *s2;
>    size_t repeats;
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -99,7 +102,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
> +  size_t half_page = getpagesize () / 2;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -121,8 +124,15 @@ test_main (void)
>      {
>        do_test (&json_ctx, 0, 0, 1 << i, 1);
>        do_test (&json_ctx, i, 0, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
>        do_test (&json_ctx, 0, i, 1 << i, 1);
> +      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
>        do_test (&json_ctx, i, i, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
> +      do_test (&json_ctx, half_page, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page, i, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
>      }
>
>    for (i = 0; i < 32; ++i)
> @@ -131,16 +141,26 @@ test_main (void)
>        do_test (&json_ctx, i, 0, i, 0);
>        do_test (&json_ctx, 0, i, i, 0);
>        do_test (&json_ctx, i, i, i, 0);
> +      do_test (&json_ctx, half_page, 0, i, 0);
> +      do_test (&json_ctx, half_page + i, 0, i, 0);
> +      do_test (&json_ctx, half_page, i, i, 0);
> +      do_test (&json_ctx, half_page + i, i, i, 0);
> +      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
> +      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
>      }
>
>    for (i = 3; i < 32; ++i)
>      {
>        if ((i & (i - 1)) == 0)
> -       continue;
> +        continue;
>        do_test (&json_ctx, 0, 0, 16 * i, 1);
>        do_test (&json_ctx, i, 0, 16 * i, 1);
>        do_test (&json_ctx, 0, i, 16 * i, 1);
>        do_test (&json_ctx, i, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
>      }
>
>    for (i = 32; i < 64; ++i)
> @@ -149,16 +169,33 @@ test_main (void)
>        do_test (&json_ctx, i, 0, 32 * i, 1);
>        do_test (&json_ctx, 0, i, 32 * i, 1);
>        do_test (&json_ctx, i, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
>      }
>
>    do_test (&json_ctx, 0, 0, getpagesize (), 1);
>
> -  for (i = 0; i <= 32; ++i)
> +  for (i = 0; i <= 48; ++i)
>      {
>        do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
>      }
>
>    json_array_end (&json_ctx);
> diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
> index 6becbf4782..855f4d0649 100644
> --- a/benchtests/bench-memmove.c
> +++ b/benchtests/bench-memmove.c
> @@ -34,7 +34,10 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> -
> +  for (i = 0; i < iters / 64; ++i)
> +    {
> +      CALL (impl, dst, src, len);
> +    }
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> @@ -53,11 +56,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -85,6 +88,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> +  size_t half_page = getpagesize () / 2;
>
>    test_init ();
>
> @@ -138,6 +142,22 @@ test_main (void)
>        do_test (&json_ctx, i, i, 32 * i);
>      }
>
> +  for (i = 0; i <= 48; ++i)
> +    {
> +      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
> +    }
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.25.1
>
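
To illustrate the 4k aliasing point from the commit message: how prone
two buffers are to false aliasing depends on the difference of their
page offsets, which a small helper (illustration only, not part of the
patch) makes explicit:

    #include <stdint.h>

    /* Low 12 bits of dst - src; a value near 0 means loads from src can
       falsely depend on earlier stores to dst.  */
    static unsigned int
    page_offset_delta (uintptr_t dst, uintptr_t src)
    {
      return (unsigned int) ((dst - src) & 0xfff);
    }

With dst and src both near page aligned the delta is roughly 0, the
noisy case the commit message describes, while the new half-page offset
cases move the delta to around 2048 bytes, well away from the aliasing
window.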

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                     ` (3 preceding siblings ...)
  2021-11-06 18:33   ` [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
@ 2021-11-06 19:12   ` H.J. Lu
  2021-11-06 21:20     ` Noah Goldstein
  2021-12-07 21:10   ` Stafford Horne
  5 siblings, 1 reply; 46+ messages in thread
From: H.J. Lu @ 2021-11-06 19:12 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit updates the memcpy tests to test both dst > src and dst <
> src. This is because there is logic in the code based on the
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
>  string/test-memmove.c |  75 ++++++++++++++++++-
>  2 files changed, 214 insertions(+), 28 deletions(-)
>
> diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> index c9e965bed3..3b0f3127b7 100644
> --- a/string/test-memcpy.c
> +++ b/string/test-memcpy.c
> @@ -17,6 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #ifndef MEMCPY_RESULT
> +# define DO_EXTRA_TESTS
>  # define MEMCPY_RESULT(dst, len) dst
>  # define MIN_PAGE_SIZE 131072
>  # define TEST_MAIN
> @@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
>  static void
>  do_test (size_t align1, size_t align2, size_t len)
>  {
> -  size_t i, j;
> +  size_t i, j, repeats;
>    char *s1, *s2;
>
>    align1 &= 4095;
> @@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
>
>    s1 = (char *) (buf1 + align1);
>    s2 = (char *) (buf2 + align2);
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      for (i = 0, j = 1; i < len; i++, j += 23)
> +        s1[i] = j;
>
> -  for (i = 0, j = 1; i < len; i++, j += 23)
> -    s1[i] = j;
> -
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (impl, s2, s1, len);
> +    }
>  }
>
>  static void
> @@ -212,56 +215,87 @@ do_random_tests (void)
>  }
>
>  static void
> -do_test1 (size_t size)
> +do_test1 (size_t align1, size_t align2, size_t size)
>  {
>    void *large_buf;
> -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> -                   MAP_PRIVATE | MAP_ANON, -1, 0);
> +  size_t mmap_size, region_size;
> +
> +  align1 &= (page_size - 1);
> +  if (align1 == 0)
> +    align1 = page_size;
> +
> +  align2 &= (page_size - 1);
> +  if (align2 == 0)
> +    align2 = page_size;
> +
> +  region_size = (size + page_size - 1) & (~(page_size - 1));
> +
> +  mmap_size = region_size * 2 + 3 * page_size;
> +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANON, -1, 0);
>    if (large_buf == MAP_FAILED)
>      {
> -      puts ("Failed to allocat large_buf, skipping do_test1");
> +      puts ("Failed to allocate large_buf, skipping do_test1");
>        return;
>      }
> -
> -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
>      error (EXIT_FAILURE, errno, "mprotect failed");
>
> -  size_t arrary_size = size / sizeof (uint32_t);
> -  uint32_t *dest = large_buf;
> -  uint32_t *src = large_buf + size + page_size;
> +  size_t array_size = size / sizeof (uint32_t);
> +  uint32_t *dest = large_buf + align1;
> +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
>    size_t i;
>    size_t repeats;
>    for(repeats = 0; repeats < 2; repeats++)
>      {
> -      for (i = 0; i < arrary_size; i++)
> +      for (i = 0; i < array_size; i++)
>          src[i] = (uint32_t) i;
> -
>        FOR_EACH_IMPL (impl, 0)
>          {
> -            printf ("\t\tRunning: %s\n", impl->name);
>            memset (dest, -1, size);
>            CALL (impl, (char *) dest, (char *) src, size);
> -          for (i = 0; i < arrary_size; i++)
> +          for (i = 0; i < array_size; i++)
>          if (dest[i] != src[i])
>            {
>              error (0, 0,
>                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
>                 impl->name, dest, src, i);
>              ret = 1;
> -            munmap ((void *) large_buf, size * 2 + page_size);
> +            munmap ((void *) large_buf, mmap_size);
>              return;
>            }
>          }
> -      dest = src;
> -      src = large_buf;
> +      dest = large_buf + region_size + 2 * page_size + align1;
> +      src = large_buf + align2;
> +    }
> +  munmap ((void *) large_buf, mmap_size);
> +}
> +
> +static void
> +do_random_large_tests (void)
> +{
> +  size_t i, align1, align2, size;
> +  for (i = 0; i < 32; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 0x1000000) + 0x200000;
> +      do_test1 (align1, align2, size);
> +    }
> +
> +  for (i = 0; i < 128; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 32768) + 4096;
> +      do_test1 (align1, align2, size);
>      }
> -  munmap ((void *) large_buf, size * 2 + page_size);
>  }
>
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j;
>
>    test_init ();
>
> @@ -298,6 +332,7 @@ test_main (void)
>    for (i = 19; i <= 25; ++i)
>      {
>        do_test (255, 0, 1 << i);
> +      do_test (0, 4000, 1 << i);
>        do_test (0, 255, i);
>        do_test (0, 4000, i);
>      }
> @@ -306,8 +341,88 @@ test_main (void)
>
>    do_random_tests ();
>
> -  do_test1 (0x100000);
> -  do_test1 (0x2000000);
> +  do_test1 (0, 0, 0x100000);
> +  do_test1 (0, 0, 0x2000000);
> +
> +  for (i = 4096; i < 32768; i += 4096)
> +    {
> +      for (j = 1; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +
> +  for (i = 0x300000; i < 0x2000000; i += 0x235689)
> +    {
> +      for (j = 64; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +#ifdef DO_EXTRA_TESTS
> +  for (i = 0x200000; i <= 0x2000000; i += i)
> +    {
> +      for (j = 64; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +
> +          do_test1 (0, j, i + 1);
> +          do_test1 (4095, j, i + 1);
> +          do_test1 (4096 - j, 0, i + 1);
> +
> +          do_test1 (0, j - 1, i + 1);
> +          do_test1 (4095, j - 1, i + 1);
> +          do_test1 (4096 - j - 1, 0, i + 1);
> +
> +          do_test1 (0, j + 1, i + 1);
> +          do_test1 (4095, j + 1, i + 1);
> +          do_test1 (4096 - j, 1, i + 1);
> +
> +          do_test1 (0, j, i - 1);
> +          do_test1 (4095, j, i - 1);
> +          do_test1 (4096 - j, 0, i - 1);
> +
> +          do_test1 (0, j - 1, i - 1);
> +          do_test1 (4095, j - 1, i - 1);
> +          do_test1 (4096 - j - 1, 0, i - 1);
> +
> +          do_test1 (0, j + 1, i - 1);
> +          do_test1 (4095, j + 1, i - 1);
> +          do_test1 (4096 - j, 1, i - 1);
> +        }
> +    }
> +#endif
> +  do_random_large_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-memmove.c b/string/test-memmove.c
> index a0ce8b0334..5c6d1579e3 100644
> --- a/string/test-memmove.c
> +++ b/string/test-memmove.c
> @@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= (getpagesize() - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize() - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
>    munmap ((void *) buf, size);
>  }
>
> +static void
> +do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
> +{
> +  size_t size, repeats, i;
> +  uint8_t *buf, *dst, *src;
> +
> +  size = bytes_move + MAX(offset1, offset2);
> +  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +             MAP_PRIVATE | MAP_ANON, -1, 0);
> +
> +  if (buf == MAP_FAILED)
> +    error (EXIT_UNSUPPORTED, errno, "mmap failed");
> +
> +  dst = &buf[offset1];
> +  src = &buf[offset2];
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      FOR_EACH_IMPL (impl, 0)
> +        {
> +          for (i = 0; i < bytes_move; i++)
> +              src[i] = (uint8_t) i;
> +#ifdef TEST_BCOPY
> +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> +#else
> +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> +#endif
> +          for (i = 0; i < bytes_move; i++)
> +            {
> +              if (dst[i] != (uint8_t) i)
> +                {
> +                  error (0, 0,
> +                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> +                         impl->name, dst, buf, i);
> +                  ret = 1;
> +                  break;
> +                }
> +            }
> +        }
> +      dst = &buf[offset2];
> +      src = &buf[offset1];
> +    }
> +  munmap ((void *) buf, size);
> +}
> +
> +
>  int
>  test_main (void)
>  {
> @@ -395,13 +440,39 @@ test_main (void)
>
>    do_random_tests ();
>
> +  do_test2 (0);
>    do_test2 (33);
> +  do_test2 (0x200000 - 1);
>    do_test2 (0x200000);
> +  do_test2 (0x200000 + 1);
> +  do_test2 (0x1000000 - 1);
> +  do_test2 (0x1000000);
> +  do_test2 (0x1000000 + 1);
>    do_test2 (0x4000000 - 1);
>    do_test2 (0x4000000);
> +  do_test2 (0x4000000 + 1);
>
>    /* Copy 16KB data.  */
>    do_test3 (16384, 3);
> +  for (i = 4096; i <= 16384; i <<= 1)
> +    {
> +      do_test4 (i, 0, i);
> +      do_test4 (i, 0, i - 1);
> +      do_test4 (i, 0, i + 1);
> +      do_test4 (i, 63, i + 63);
> +      do_test4 (i, 63, i + 64);
> +      do_test4 (i, 63, i);
> +
> +      do_test4 (i, 0, 1);
> +      do_test4 (i, 0, 15);
> +      do_test4 (i, 0, 31);
> +      do_test4 (i, 0, 63);
> +      do_test4 (i, 0, 64);
> +      do_test4 (i, 0, 65);
> +      do_test4 (i, 0, 127);
> +      do_test4 (i, 0, 129);
> +    }
> +
>
>    return ret;
>  }
> --
> 2.25.1
>
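
For readers following the reworked do_test1, the mapping it builds can
be mirrored by a small helper (hypothetical names, not part of the
test):

    #include <stddef.h>

    struct test1_layout
    {
      size_t region_size, mmap_size, guard_offset, dest_offset, src_offset;
    };

    /* Offsets as computed in do_test1 above.  */
    static struct test1_layout
    test1_layout (size_t size, size_t page_size, size_t align1, size_t align2)
    {
      struct test1_layout l;
      l.region_size = (size + page_size - 1) & ~(page_size - 1);
      l.mmap_size = l.region_size * 2 + 3 * page_size;
      /* One PROT_NONE page sits between the two copy regions.  */
      l.guard_offset = l.region_size + page_size;
      l.dest_offset = align1;                                 /* first region  */
      l.src_offset = l.region_size + 2 * page_size + align2;  /* second region */
      return l;
    }

Since align1 and align2 are forced into [1, page_size], a copy that
overruns either region toward the other lands on the PROT_NONE guard
page and faults, and the second iteration of the repeats loop swaps
dest and src across the guard so both copy directions are exercised.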

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-06 19:12   ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
@ 2021-11-06 21:20     ` Noah Goldstein
  2021-11-07 13:53       ` H.J. Lu
  0 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-11-06 21:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This commit updates the memcpy tests to test both dst > src and dst <
> > src. This is because there is logic in the code based on the
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
> >  string/test-memmove.c |  75 ++++++++++++++++++-
> >  2 files changed, 214 insertions(+), 28 deletions(-)
> >
> > diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> > index c9e965bed3..3b0f3127b7 100644
> > --- a/string/test-memcpy.c
> > +++ b/string/test-memcpy.c
> > @@ -17,6 +17,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #ifndef MEMCPY_RESULT
> > +# define DO_EXTRA_TESTS
> >  # define MEMCPY_RESULT(dst, len) dst
> >  # define MIN_PAGE_SIZE 131072
> >  # define TEST_MAIN
> > @@ -78,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
> >  static void
> >  do_test (size_t align1, size_t align2, size_t len)
> >  {
> > -  size_t i, j;
> > +  size_t i, j, repeats;
> >    char *s1, *s2;
> >
> >    align1 &= 4095;
> > @@ -91,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
> >
> >    s1 = (char *) (buf1 + align1);
> >    s2 = (char *) (buf2 + align2);
> > +  for (repeats = 0; repeats < 2; ++repeats)
> > +    {
> > +      for (i = 0, j = 1; i < len; i++, j += 23)
> > +        s1[i] = j;
> >
> > -  for (i = 0, j = 1; i < len; i++, j += 23)
> > -    s1[i] = j;
> > -
> > -  FOR_EACH_IMPL (impl, 0)
> > -    do_one_test (impl, s2, s1, len);
> > +      FOR_EACH_IMPL (impl, 0)
> > +        do_one_test (impl, s2, s1, len);
> > +    }
> >  }
> >
> >  static void
> > @@ -212,56 +215,87 @@ do_random_tests (void)
> >  }
> >
> >  static void
> > -do_test1 (size_t size)
> > +do_test1 (size_t align1, size_t align2, size_t size)
> >  {
> >    void *large_buf;
> > -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> > -                   MAP_PRIVATE | MAP_ANON, -1, 0);
> > +  size_t mmap_size, region_size;
> > +
> > +  align1 &= (page_size - 1);
> > +  if (align1 == 0)
> > +    align1 = page_size;
> > +
> > +  align2 &= (page_size - 1);
> > +  if (align2 == 0)
> > +    align2 = page_size;
> > +
> > +  region_size = (size + page_size - 1) & (~(page_size - 1));
> > +
> > +  mmap_size = region_size * 2 + 3 * page_size;
> > +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> > +                   MAP_PRIVATE | MAP_ANON, -1, 0);
> >    if (large_buf == MAP_FAILED)
> >      {
> > -      puts ("Failed to allocat large_buf, skipping do_test1");
> > +      puts ("Failed to allocate large_buf, skipping do_test1");
> >        return;
> >      }
> > -
> > -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> > +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
> >      error (EXIT_FAILURE, errno, "mprotect failed");
> >
> > -  size_t arrary_size = size / sizeof (uint32_t);
> > -  uint32_t *dest = large_buf;
> > -  uint32_t *src = large_buf + size + page_size;
> > +  size_t array_size = size / sizeof (uint32_t);
> > +  uint32_t *dest = large_buf + align1;
> > +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
> >    size_t i;
> >    size_t repeats;
> >    for(repeats = 0; repeats < 2; repeats++)
> >      {
> > -      for (i = 0; i < arrary_size; i++)
> > +      for (i = 0; i < array_size; i++)
> >          src[i] = (uint32_t) i;
> > -
> >        FOR_EACH_IMPL (impl, 0)
> >          {
> > -            printf ("\t\tRunning: %s\n", impl->name);
> >            memset (dest, -1, size);
> >            CALL (impl, (char *) dest, (char *) src, size);
> > -          for (i = 0; i < arrary_size; i++)
> > +          for (i = 0; i < array_size; i++)
> >          if (dest[i] != src[i])
> >            {
> >              error (0, 0,
> >                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> >                 impl->name, dest, src, i);
> >              ret = 1;
> > -            munmap ((void *) large_buf, size * 2 + page_size);
> > +            munmap ((void *) large_buf, mmap_size);
> >              return;
> >            }
> >          }
> > -      dest = src;
> > -      src = large_buf;
> > +      dest = large_buf + region_size + 2 * page_size + align1;
> > +      src = large_buf + align2;
> > +    }
> > +  munmap ((void *) large_buf, mmap_size);
> > +}
> > +
> > +static void
> > +do_random_large_tests (void)
> > +{
> > +  size_t i, align1, align2, size;
> > +  for (i = 0; i < 32; ++i)
> > +    {
> > +      align1 = random ();
> > +      align2 = random ();
> > +      size = (random() % 0x1000000) + 0x200000;
> > +      do_test1 (align1, align2, size);
> > +    }
> > +
> > +  for (i = 0; i < 128; ++i)
> > +    {
> > +      align1 = random ();
> > +      align2 = random ();
> > +      size = (random() % 32768) + 4096;
> > +      do_test1 (align1, align2, size);
> >      }
> > -  munmap ((void *) large_buf, size * 2 + page_size);
> >  }
> >
> >  int
> >  test_main (void)
> >  {
> > -  size_t i;
> > +  size_t i, j;
> >
> >    test_init ();
> >
> > @@ -298,6 +332,7 @@ test_main (void)
> >    for (i = 19; i <= 25; ++i)
> >      {
> >        do_test (255, 0, 1 << i);
> > +      do_test (0, 4000, 1 << i);
> >        do_test (0, 255, i);
> >        do_test (0, 4000, i);
> >      }
> > @@ -306,8 +341,88 @@ test_main (void)
> >
> >    do_random_tests ();
> >
> > -  do_test1 (0x100000);
> > -  do_test1 (0x2000000);
> > +  do_test1 (0, 0, 0x100000);
> > +  do_test1 (0, 0, 0x2000000);
> > +
> > +  for (i = 4096; i < 32768; i += 4096)
> > +    {
> > +      for (j = 1; j <= 1024; j <<= 1)
> > +        {
> > +          do_test1 (0, j, i);
> > +          do_test1 (4095, j, i);
> > +          do_test1 (4096 - j, 0, i);
> > +
> > +          do_test1 (0, j - 1, i);
> > +          do_test1 (4095, j - 1, i);
> > +          do_test1 (4096 - j - 1, 0, i);
> > +
> > +          do_test1 (0, j + 1, i);
> > +          do_test1 (4095, j + 1, i);
> > +          do_test1 (4096 - j, 1, i);
> > +        }
> > +    }
> > +
> > +  for (i = 0x300000; i < 0x2000000; i += 0x235689)
> > +    {
> > +      for (j = 64; j <= 1024; j <<= 1)
> > +        {
> > +          do_test1 (0, j, i);
> > +          do_test1 (4095, j, i);
> > +          do_test1 (4096 - j, 0, i);
> > +
> > +          do_test1 (0, j - 1, i);
> > +          do_test1 (4095, j - 1, i);
> > +          do_test1 (4096 - j - 1, 0, i);
> > +
> > +          do_test1 (0, j + 1, i);
> > +          do_test1 (4095, j + 1, i);
> > +          do_test1 (4096 - j, 1, i);
> > +        }
> > +    }
> > +#ifdef DO_EXTRA_TESTS
> > +  for (i = 0x200000; i <= 0x2000000; i += i)
> > +    {
> > +      for (j = 64; j <= 1024; j <<= 1)
> > +        {
> > +          do_test1 (0, j, i);
> > +          do_test1 (4095, j, i);
> > +          do_test1 (4096 - j, 0, i);
> > +
> > +          do_test1 (0, j - 1, i);
> > +          do_test1 (4095, j - 1, i);
> > +          do_test1 (4096 - j - 1, 0, i);
> > +
> > +          do_test1 (0, j + 1, i);
> > +          do_test1 (4095, j + 1, i);
> > +          do_test1 (4096 - j, 1, i);
> > +
> > +          do_test1 (0, j, i + 1);
> > +          do_test1 (4095, j, i + 1);
> > +          do_test1 (4096 - j, 0, i + 1);
> > +
> > +          do_test1 (0, j - 1, i + 1);
> > +          do_test1 (4095, j - 1, i + 1);
> > +          do_test1 (4096 - j - 1, 0, i + 1);
> > +
> > +          do_test1 (0, j + 1, i + 1);
> > +          do_test1 (4095, j + 1, i + 1);
> > +          do_test1 (4096 - j, 1, i + 1);
> > +
> > +          do_test1 (0, j, i - 1);
> > +          do_test1 (4095, j, i - 1);
> > +          do_test1 (4096 - j, 0, i - 1);
> > +
> > +          do_test1 (0, j - 1, i - 1);
> > +          do_test1 (4095, j - 1, i - 1);
> > +          do_test1 (4096 - j - 1, 0, i - 1);
> > +
> > +          do_test1 (0, j + 1, i - 1);
> > +          do_test1 (4095, j + 1, i - 1);
> > +          do_test1 (4096 - j, 1, i - 1);
> > +        }
> > +    }
> > +#endif
> > +  do_random_large_tests ();
> >    return ret;
> >  }
> >
> > diff --git a/string/test-memmove.c b/string/test-memmove.c
> > index a0ce8b0334..5c6d1579e3 100644
> > --- a/string/test-memmove.c
> > +++ b/string/test-memmove.c
> > @@ -100,11 +100,11 @@ do_test (size_t align1, size_t align2, size_t len)
> >    size_t i, j;
> >    char *s1, *s2;
> >
> > -  align1 &= 63;
> > +  align1 &= (getpagesize() - 1);
> >    if (align1 + len >= page_size)
> >      return;
> >
> > -  align2 &= 63;
> > +  align2 &= (getpagesize() - 1);
> >    if (align2 + len >= page_size)
> >      return;
> >
> > @@ -355,6 +355,51 @@ do_test3 (size_t bytes_move, size_t offset)
> >    munmap ((void *) buf, size);
> >  }
> >
> > +static void
> > +do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
> > +{
> > +  size_t size, repeats, i;
> > +  uint8_t *buf, *dst, *src;
> > +
> > +  size = bytes_move + MAX(offset1, offset2);
> > +  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
> > +             MAP_PRIVATE | MAP_ANON, -1, 0);
> > +
> > +  if (buf == MAP_FAILED)
> > +    error (EXIT_UNSUPPORTED, errno, "mmap failed");
> > +
> > +  dst = &buf[offset1];
> > +  src = &buf[offset2];
> > +  for (repeats = 0; repeats < 2; ++repeats)
> > +    {
> > +      FOR_EACH_IMPL (impl, 0)
> > +        {
> > +          for (i = 0; i < bytes_move; i++)
> > +              src[i] = (uint8_t) i;
> > +#ifdef TEST_BCOPY
> > +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> > +#else
> > +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> > +#endif
> > +          for (i = 0; i < bytes_move; i++)
> > +            {
> > +              if (dst[i] != (uint8_t) i)
> > +                {
> > +                  error (0, 0,
> > +                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> > +                         impl->name, dst, buf, i);
> > +                  ret = 1;
> > +                  break;
> > +                }
> > +            }
> > +        }
> > +      dst = &buf[offset2];
> > +      src = &buf[offset1];
> > +    }
> > +  munmap ((void *) buf, size);
> > +}
> > +
> > +
> >  int
> >  test_main (void)
> >  {
> > @@ -395,13 +440,39 @@ test_main (void)
> >
> >    do_random_tests ();
> >
> > +  do_test2 (0);
> >    do_test2 (33);
> > +  do_test2 (0x200000 - 1);
> >    do_test2 (0x200000);
> > +  do_test2 (0x200000 + 1);
> > +  do_test2 (0x1000000 - 1);
> > +  do_test2 (0x1000000);
> > +  do_test2 (0x1000000 + 1);
> >    do_test2 (0x4000000 - 1);
> >    do_test2 (0x4000000);
> > +  do_test2 (0x4000000 + 1);
> >
> >    /* Copy 16KB data.  */
> >    do_test3 (16384, 3);
> > +  for (i = 4096; i <= 16384; i <<= 1)
> > +    {
> > +      do_test4 (i, 0, i);
> > +      do_test4 (i, 0, i - 1);
> > +      do_test4 (i, 0, i + 1);
> > +      do_test4 (i, 63, i + 63);
> > +      do_test4 (i, 63, i + 64);
> > +      do_test4 (i, 63, i);
> > +
> > +      do_test4 (i, 0, 1);
> > +      do_test4 (i, 0, 15);
> > +      do_test4 (i, 0, 31);
> > +      do_test4 (i, 0, 63);
> > +      do_test4 (i, 0, 64);
> > +      do_test4 (i, 0, 65);
> > +      do_test4 (i, 0, 127);
> > +      do_test4 (i, 0, 129);
> > +    }
> > +
> >
> >    return ret;
> >  }
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.

Thanks. Pushed the patchset.

>
>
> --
> H.J.


* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-06 21:20     ` Noah Goldstein
@ 2021-11-07 13:53       ` H.J. Lu
  0 siblings, 0 replies; 46+ messages in thread
From: H.J. Lu @ 2021-11-07 13:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Nov 6, 2021 at 2:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > This commit updates the memcpy tests to test both dst > src and dst <
> > > src. This is because there is logic in the code based on the
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > >  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
> > >  string/test-memmove.c |  75 ++++++++++++++++++-
> > >  2 files changed, 214 insertions(+), 28 deletions(-)
> > >
> > > [..]
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
>
> Thanks. Pushed the patchset.
>

We need to increase its timeout.  On a loaded machine, I got:

[hjl@gnu-skx-1 build-x86_64-linux]$ cat string/test-memcpy.out
                        builtin_memcpy simple_memcpy
__memcpy_avx_unaligned __memcpy_avx_unaligned_erms
__memcpy_avx_unaligned_rtm __memcpy_avx_unaligned_erms_rtm
__memcpy_evex_unaligned __memcpy_evex_unaligned_erms
__memcpy_ssse3_back __memcpy_ssse3 __memcpy_avx512_no_vzeroupper
__memcpy_avx512_unaligned __memcpy_avx512_unaligned_erms
__memcpy_sse2_unaligned __memcpy_sse2_unaligned_erms __memcpy_erms
Timed out: killed the child process
Termination time: 2021-11-07T13:37:49.398927171
Last write to standard output: 2021-11-07T13:33:49.398122020
[hjl@gnu-skx-1 build-x86_64-linux]$
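
For reference, the per-test limit comes from the TIMEOUT macro: a test can
define it before the test driver is pulled in, and support/test-driver.c only
falls back to its short default when the test does not define one.  A minimal
sketch of what raising it in string/test-memcpy.c might look like is below;
the placement and the (8 * 60) value are illustrative guesses, not necessarily
what ends up being committed.

 #ifndef MEMCPY_RESULT
 # define DO_EXTRA_TESTS
 # define MEMCPY_RESULT(dst, len) dst
 # define MIN_PAGE_SIZE 131072
+/* Illustrative placeholder: give the much larger test matrix room to
+   finish on a loaded machine.  */
+# define TIMEOUT (8 * 60)
 # define TEST_MAIN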

-- 
H.J.


* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
                     ` (4 preceding siblings ...)
  2021-11-06 19:12   ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
@ 2021-12-07 21:10   ` Stafford Horne
  2021-12-07 21:36     ` Noah Goldstein
  5 siblings, 1 reply; 46+ messages in thread
From: Stafford Horne @ 2021-12-07 21:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Sat, Nov 06, 2021 at 01:33:18PM -0500, Noah Goldstein via Libc-alpha wrote:
> This commit updates the memcpy tests to test both dst > src and dst <
> src. This is because there is logic in the code based on the
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
>  string/test-memmove.c |  75 ++++++++++++++++++-
>  2 files changed, 214 insertions(+), 28 deletions(-)
> 
> diff --git a/string/test-memcpy.c b/string/test-memcpy.c
[..]
>  static void
> -do_test1 (size_t size)
> +do_test1 (size_t align1, size_t align2, size_t size)
>  {
>    void *large_buf;
> -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> -		    MAP_PRIVATE | MAP_ANON, -1, 0);
> +  size_t mmap_size, region_size;
> +
> +  align1 &= (page_size - 1);
> +  if (align1 == 0)
> +    align1 = page_size;
> +
> +  align2 &= (page_size - 1);
> +  if (align2 == 0)
> +    align2 = page_size;
> +
> +  region_size = (size + page_size - 1) & (~(page_size - 1));
> +
> +  mmap_size = region_size * 2 + 3 * page_size;
> +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANON, -1, 0);
>    if (large_buf == MAP_FAILED)
>      {
> -      puts ("Failed to allocat large_buf, skipping do_test1");
> +      puts ("Failed to allocate large_buf, skipping do_test1");
>        return;
>      }
> -
> -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
>      error (EXIT_FAILURE, errno, "mprotect failed");
>  
> -  size_t arrary_size = size / sizeof (uint32_t);
> -  uint32_t *dest = large_buf;
> -  uint32_t *src = large_buf + size + page_size;
> +  size_t array_size = size / sizeof (uint32_t);
> +  uint32_t *dest = large_buf + align1;
> +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;

Hello, this causes Bus errors on the new OpenRISC port I am working on.

>    size_t i;
>    size_t repeats;
>    for(repeats = 0; repeats < 2; repeats++)
>      {
> -      for (i = 0; i < arrary_size; i++)
> +      for (i = 0; i < array_size; i++)
>          src[i] = (uint32_t) i;

The bus errors happen here; they are triggered when align2 is 1 or 2.  OpenRISC
(and maybe other architectures?) does not support unaligned word accesses.

I fixed this by limiting align1/align2 to 4, but I am not sure whether that is
what you are trying to test here.

Maybe we need to change how we set up the src array.
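
For instance, one alignment-safe way to fill (and later compare) the source
words is to go through memcpy with a small constant size, which compilers
lower to accesses that are legal on strict-alignment targets.  This is only a
sketch of the idea, not the patch that was eventually applied, and the helper
name is made up for illustration:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Fill SIZE bytes at SRC with the 0, 1, 2, ... uint32_t pattern that
   do_test1 uses, without ever doing a possibly unaligned uint32_t
   store.  */
static void
fill_src_words (unsigned char *src, size_t size)
{
  size_t array_size = size / sizeof (uint32_t);
  for (size_t i = 0; i < array_size; i++)
    {
      uint32_t v = (uint32_t) i;
      memcpy (src + i * sizeof v, &v, sizeof v);
    }
}

The verification loop (dest[i] != src[i]) would need the same treatment,
e.g. copying both words into temporaries with memcpy before comparing.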

> -
>        FOR_EACH_IMPL (impl, 0)
>          {
> -            printf ("\t\tRunning: %s\n", impl->name);
>            memset (dest, -1, size);
>            CALL (impl, (char *) dest, (char *) src, size);
> -          for (i = 0; i < arrary_size; i++)
> +          for (i = 0; i < array_size; i++)
>          if (dest[i] != src[i])
>            {
>              error (0, 0,
>                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
>                 impl->name, dest, src, i);
>              ret = 1;
> -            munmap ((void *) large_buf, size * 2 + page_size);
> +            munmap ((void *) large_buf, mmap_size);
>              return;
>            }
>          }
> -      dest = src;
> -      src = large_buf;
> +      dest = large_buf + region_size + 2 * page_size + align1;
> +      src = large_buf + align2;
> +    }
> +  munmap ((void *) large_buf, mmap_size);
> +}
> +
[..]
> @@ -306,8 +341,88 @@ test_main (void)
>  
>    do_random_tests ();
>  
> -  do_test1 (0x100000);
> -  do_test1 (0x2000000);
> +  do_test1 (0, 0, 0x100000);
> +  do_test1 (0, 0, 0x2000000);
> +
> +  for (i = 4096; i < 32768; i += 4096)
> +    {
> +      for (j = 1; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);

These +1/-1 offsets cause unaligned word accesses.

> +        }
> +    }
> +

-Stafford


* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-12-07 21:10   ` Stafford Horne
@ 2021-12-07 21:36     ` Noah Goldstein
  2021-12-07 22:07       ` Stafford Horne
  0 siblings, 1 reply; 46+ messages in thread
From: Noah Goldstein @ 2021-12-07 21:36 UTC (permalink / raw)
  To: Stafford Horne; +Cc: GNU C Library

On Tue, Dec 7, 2021 at 3:10 PM Stafford Horne <shorne@gmail.com> wrote:
>
> On Sat, Nov 06, 2021 at 01:33:18PM -0500, Noah Goldstein via Libc-alpha wrote:
> > This commit updates the memcpy tests to test both dst > src and dst <
> > src. This is because there is logic in the code based on the
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
> >  string/test-memmove.c |  75 ++++++++++++++++++-
> >  2 files changed, 214 insertions(+), 28 deletions(-)
> >
> > diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> [..]
> >  static void
> > -do_test1 (size_t size)
> > +do_test1 (size_t align1, size_t align2, size_t size)
> >  {
> >    void *large_buf;
> > -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> > -                 MAP_PRIVATE | MAP_ANON, -1, 0);
> > +  size_t mmap_size, region_size;
> > +
> > +  align1 &= (page_size - 1);
> > +  if (align1 == 0)
> > +    align1 = page_size;
> > +
> > +  align2 &= (page_size - 1);
> > +  if (align2 == 0)
> > +    align2 = page_size;
> > +
> > +  region_size = (size + page_size - 1) & (~(page_size - 1));
> > +
> > +  mmap_size = region_size * 2 + 3 * page_size;
> > +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> > +                   MAP_PRIVATE | MAP_ANON, -1, 0);
> >    if (large_buf == MAP_FAILED)
> >      {
> > -      puts ("Failed to allocat large_buf, skipping do_test1");
> > +      puts ("Failed to allocate large_buf, skipping do_test1");
> >        return;
> >      }
> > -
> > -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> > +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
> >      error (EXIT_FAILURE, errno, "mprotect failed");
> >
> > -  size_t arrary_size = size / sizeof (uint32_t);
> > -  uint32_t *dest = large_buf;
> > -  uint32_t *src = large_buf + size + page_size;
> > +  size_t array_size = size / sizeof (uint32_t);
> > +  uint32_t *dest = large_buf + align1;
> > +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
>
> Hello, this causes Bus errors on the new OpenRISC port I am working on.

Bugzilla for this issue is here:
https://sourceware.org/bugzilla/show_bug.cgi?id=28572

There is a patch attached to the bugzilla and another proposed by
Adhemerval Zanella; either of them fixes the issue.

>
> >    size_t i;
> >    size_t repeats;
> >    for(repeats = 0; repeats < 2; repeats++)
> >      {
> > -      for (i = 0; i < arrary_size; i++)
> > +      for (i = 0; i < array_size; i++)
> >          src[i] = (uint32_t) i;
>
> The bus errors happen here caused when align2 is 1 or 2.  OpenRISC (and maybe
> other architectures?) do not support unaligned copies of words.
>
> I fixed this by limiting the align1/align2 to 4 but I am not sure if that is
> what you are trying to copy here.
>
> Maybe we need to change how we setup the src array.
>
> > -
> >        FOR_EACH_IMPL (impl, 0)
> >          {
> > -            printf ("\t\tRunning: %s\n", impl->name);
> >            memset (dest, -1, size);
> >            CALL (impl, (char *) dest, (char *) src, size);
> > -          for (i = 0; i < arrary_size; i++)
> > +          for (i = 0; i < array_size; i++)
> >          if (dest[i] != src[i])
> >            {
> >              error (0, 0,
> >                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> >                 impl->name, dest, src, i);
> >              ret = 1;
> > -            munmap ((void *) large_buf, size * 2 + page_size);
> > +            munmap ((void *) large_buf, mmap_size);
> >              return;
> >            }
> >          }
> > -      dest = src;
> > -      src = large_buf;
> > +      dest = large_buf + region_size + 2 * page_size + align1;
> > +      src = large_buf + align2;
> > +    }
> > +  munmap ((void *) large_buf, mmap_size);
> > +}
> > +
> [..]
> > @@ -306,8 +341,88 @@ test_main (void)
> >
> >    do_random_tests ();
> >
> > -  do_test1 (0x100000);
> > -  do_test1 (0x2000000);
> > +  do_test1 (0, 0, 0x100000);
> > +  do_test1 (0, 0, 0x2000000);
> > +
> > +  for (i = 4096; i < 32768; i += 4096)
> > +    {
> > +      for (j = 1; j <= 1024; j <<= 1)
> > +        {
> > +          do_test1 (0, j, i);
> > +          do_test1 (4095, j, i);
> > +          do_test1 (4096 - j, 0, i);
> > +
> > +          do_test1 (0, j - 1, i);
> > +          do_test1 (4095, j - 1, i);
> > +          do_test1 (4096 - j - 1, 0, i);
> > +
> > +          do_test1 (0, j + 1, i);
> > +          do_test1 (4095, j + 1, i);
> > +          do_test1 (4096 - j, 1, i);
>
> These +1, -1's cause non-aligned word access.
>
> > +        }
> > +    }
> > +
>
> -Stafford

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-12-07 21:36     ` Noah Goldstein
@ 2021-12-07 22:07       ` Stafford Horne
  2021-12-07 22:13         ` Noah Goldstein
  0 siblings, 1 reply; 46+ messages in thread
From: Stafford Horne @ 2021-12-07 22:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Dec 07, 2021 at 03:36:48PM -0600, Noah Goldstein wrote:
> On Tue, Dec 7, 2021 at 3:10 PM Stafford Horne <shorne@gmail.com> wrote:
> >
> > On Sat, Nov 06, 2021 at 01:33:18PM -0500, Noah Goldstein via Libc-alpha wrote:
> > > This commit updates the memcpy tests to test both dst > src and dst <
> > > src. This is because there is logic in the code based on the
> > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > >  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
> > >  string/test-memmove.c |  75 ++++++++++++++++++-
> > >  2 files changed, 214 insertions(+), 28 deletions(-)
> > >
> > > diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> > [..]
> > >  static void
> > > -do_test1 (size_t size)
> > > +do_test1 (size_t align1, size_t align2, size_t size)
> > >  {
> > >    void *large_buf;
> > > -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> > > -                 MAP_PRIVATE | MAP_ANON, -1, 0);
> > > +  size_t mmap_size, region_size;
> > > +
> > > +  align1 &= (page_size - 1);
> > > +  if (align1 == 0)
> > > +    align1 = page_size;
> > > +
> > > +  align2 &= (page_size - 1);
> > > +  if (align2 == 0)
> > > +    align2 = page_size;
> > > +
> > > +  region_size = (size + page_size - 1) & (~(page_size - 1));
> > > +
> > > +  mmap_size = region_size * 2 + 3 * page_size;
> > > +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> > > +                   MAP_PRIVATE | MAP_ANON, -1, 0);
> > >    if (large_buf == MAP_FAILED)
> > >      {
> > > -      puts ("Failed to allocat large_buf, skipping do_test1");
> > > +      puts ("Failed to allocate large_buf, skipping do_test1");
> > >        return;
> > >      }
> > > -
> > > -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> > > +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
> > >      error (EXIT_FAILURE, errno, "mprotect failed");
> > >
> > > -  size_t arrary_size = size / sizeof (uint32_t);
> > > -  uint32_t *dest = large_buf;
> > > -  uint32_t *src = large_buf + size + page_size;
> > > +  size_t array_size = size / sizeof (uint32_t);
> > > +  uint32_t *dest = large_buf + align1;
> > > +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
> >
> > Hello, this causes Bus errors on the new OpenRISC port I am working on.
> 
> Bugzilla for this issue is here:
> https://sourceware.org/bugzilla/show_bug.cgi?id=28572
> 
> There is a patch attached to the bugzilla and a patch proposed
> by Adhemerval Zanella that both fix the issue.

Thanks, that patch should work.

-Stafford


* Re: [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c
  2021-12-07 22:07       ` Stafford Horne
@ 2021-12-07 22:13         ` Noah Goldstein
  0 siblings, 0 replies; 46+ messages in thread
From: Noah Goldstein @ 2021-12-07 22:13 UTC (permalink / raw)
  To: Stafford Horne; +Cc: GNU C Library

On Tue, Dec 7, 2021 at 4:07 PM Stafford Horne <shorne@gmail.com> wrote:
>
> On Tue, Dec 07, 2021 at 03:36:48PM -0600, Noah Goldstein wrote:
> > On Tue, Dec 7, 2021 at 3:10 PM Stafford Horne <shorne@gmail.com> wrote:
> > >
> > > On Sat, Nov 06, 2021 at 01:33:18PM -0500, Noah Goldstein via Libc-alpha wrote:
> > > > This commit updates the memcpy tests to test both dst > src and dst <
> > > > src. This is because there is logic in the code based on the
> > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > > > ---
> > > >  string/test-memcpy.c  | 167 +++++++++++++++++++++++++++++++++++-------
> > > >  string/test-memmove.c |  75 ++++++++++++++++++-
> > > >  2 files changed, 214 insertions(+), 28 deletions(-)
> > > >
> > > > diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> > > [..]
> > > >  static void
> > > > -do_test1 (size_t size)
> > > > +do_test1 (size_t align1, size_t align2, size_t size)
> > > >  {
> > > >    void *large_buf;
> > > > -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> > > > -                 MAP_PRIVATE | MAP_ANON, -1, 0);
> > > > +  size_t mmap_size, region_size;
> > > > +
> > > > +  align1 &= (page_size - 1);
> > > > +  if (align1 == 0)
> > > > +    align1 = page_size;
> > > > +
> > > > +  align2 &= (page_size - 1);
> > > > +  if (align2 == 0)
> > > > +    align2 = page_size;
> > > > +
> > > > +  region_size = (size + page_size - 1) & (~(page_size - 1));
> > > > +
> > > > +  mmap_size = region_size * 2 + 3 * page_size;
> > > > +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> > > > +                   MAP_PRIVATE | MAP_ANON, -1, 0);
> > > >    if (large_buf == MAP_FAILED)
> > > >      {
> > > > -      puts ("Failed to allocat large_buf, skipping do_test1");
> > > > +      puts ("Failed to allocate large_buf, skipping do_test1");
> > > >        return;
> > > >      }
> > > > -
> > > > -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> > > > +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
> > > >      error (EXIT_FAILURE, errno, "mprotect failed");
> > > >
> > > > -  size_t arrary_size = size / sizeof (uint32_t);
> > > > -  uint32_t *dest = large_buf;
> > > > -  uint32_t *src = large_buf + size + page_size;
> > > > +  size_t array_size = size / sizeof (uint32_t);
> > > > +  uint32_t *dest = large_buf + align1;
> > > > +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
> > >
> > > Hello, this causes Bus errors on the new OpenRISC port I am working on.
> >
> > Bugzilla for this issue is here:
> > https://sourceware.org/bugzilla/show_bug.cgi?id=28572
> >
> > There is a patch attached to the bugzilla and a patch proposed
> > by Adhemerval Zanella that both fix the issue.
>
> Thanks,  That patch should work.
>
> -Stafford

Just posted it.


* Re: [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S
  2021-11-06 19:11     ` H.J. Lu
@ 2022-04-23  1:41       ` Sunil Pandey
  0 siblings, 0 replies; 46+ messages in thread
From: Sunil Pandey @ 2022-04-23  1:41 UTC (permalink / raw)
  To: H.J. Lu, libc-stable; +Cc: Noah Goldstein, GNU C Library

On Sat, Nov 6, 2021 at 12:12 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug.
> >
> > The optimizations are as follows:
> >
> > 1) Always align entry to 64 bytes. This makes behavior more
> >    predictable and makes other frontend optimizations easier.
> >
> > 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
> >    significant benefits in the case that:
> >         0 < (dst - src) < [256, 512]
> >
> > 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
> >    improvement and for FSRM [-10%, 25%].
> >
> > In addition to these primary changes there is general cleanup
> > throughout to optimize the aligning routines and control flow logic.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  sysdeps/x86_64/memmove.S                      |   2 +-
> >  .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
> >  .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
> >  .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
> >  .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
> >  6 files changed, 381 insertions(+), 224 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> > index db106a7a1f..b2b3180848 100644
> > --- a/sysdeps/x86_64/memmove.S
> > +++ b/sysdeps/x86_64/memmove.S
> > @@ -25,7 +25,7 @@
> >  /* Use movups and movaps for smaller code sizes.  */
> >  #define VMOVU          movups
> >  #define VMOVA          movaps
> > -
> > +#define MOV_SIZE       3
> >  #define SECTION(p)             p
> >
> >  #ifdef USE_MULTIARCH
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > index 1ec1962e86..67a55f0c85 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > @@ -4,7 +4,7 @@
> >  # define VMOVNT                vmovntdq
> >  # define VMOVU         vmovdqu
> >  # define VMOVA         vmovdqa
> > -
> > +# define MOV_SIZE      4
> >  # define ZERO_UPPER_VEC_REGISTERS_RETURN \
> >    ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > index e195e93f15..975ae6c051 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > @@ -4,7 +4,7 @@
> >  # define VMOVNT                vmovntdq
> >  # define VMOVU         vmovdqu
> >  # define VMOVA         vmovdqa
> > -
> > +# define MOV_SIZE      4
> >  # define SECTION(p)            p##.avx
> >  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > index 848848ab39..0fa7126830 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > @@ -25,7 +25,7 @@
> >  # define VMOVU         vmovdqu64
> >  # define VMOVA         vmovdqa64
> >  # define VZEROUPPER
> > -
> > +# define MOV_SIZE      6
> >  # define SECTION(p)            p##.evex512
> >  # define MEMMOVE_SYMBOL(p,s)   p##_avx512_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > index 0cbce8f944..88715441fe 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > @@ -25,7 +25,7 @@
> >  # define VMOVU         vmovdqu64
> >  # define VMOVA         vmovdqa64
> >  # define VZEROUPPER
> > -
> > +# define MOV_SIZE      6
> >  # define SECTION(p)            p##.evex
> >  # define MEMMOVE_SYMBOL(p,s)   p##_evex_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index abde8438d4..7b27cbdda5 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -76,6 +76,25 @@
> >  # endif
> >  #endif
> >
> > +/* Whether to align before movsb. Ultimately we want 64 byte
> > +   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
> > +#define ALIGN_MOVSB    (VEC_SIZE > 16)
> > +/* Number of bytes to align movsb to.  */
> > +#define MOVSB_ALIGN_TO 64
> > +
> > +#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
> > +#define LARGE_MOV_SIZE (MOV_SIZE > 4)
> > +
> > +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> > +# error MOV_SIZE Unknown
> > +#endif
> > +
> > +#if LARGE_MOV_SIZE
> > +# define SMALL_SIZE_OFFSET     (4)
> > +#else
> > +# define SMALL_SIZE_OFFSET     (0)
> > +#endif
> > +
> >  #ifndef PAGE_SIZE
> >  # define PAGE_SIZE 4096
> >  #endif
> > @@ -199,25 +218,21 @@ L(start):
> >  # endif
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> > +       /* Load regardless.  */
> > +       VMOVU   (%rsi), %VEC(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(more_2x_vec)
> > -#if !defined USE_MULTIARCH || !IS_IN (libc)
> > -L(last_2x_vec):
> > -#endif
> >         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > -       VMOVU   (%rsi), %VEC(0)
> >         VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> >         VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> > -#if !defined USE_MULTIARCH || !IS_IN (libc)
> > -L(nop):
> > -       ret
> > +#if !(defined USE_MULTIARCH && IS_IN (libc))
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> >  #else
> >         VZEROUPPER_RETURN
> >  #endif
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> > -
> >  # if VEC_SIZE == 16
> >  ENTRY (__mempcpy_chk_erms)
> >         cmp     %RDX_LP, %RCX_LP
> > @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> >  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> >  # endif
> >
> > -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
> >         movq    %rdi, %rax
> >  L(start_erms):
> >  # ifdef __ILP32__
> > @@ -298,310 +313,448 @@ L(start_erms):
> >  # endif
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> > +       /* Load regardless.  */
> > +       VMOVU   (%rsi), %VEC(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(movsb_more_2x_vec)
> > -L(last_2x_vec):
> > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> > +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> > +        */
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
> >         VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> > +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
> >  L(return):
> > -#if VEC_SIZE > 16
> > +# if VEC_SIZE > 16
> >         ZERO_UPPER_VEC_REGISTERS_RETURN
> > -#else
> > +# else
> >         ret
> > +# endif
> >  #endif
> >
> > -L(movsb):
> > -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> > -       jae     L(more_8x_vec)
> > -       cmpq    %rsi, %rdi
> > -       jb      1f
> > -       /* Source == destination is less common.  */
> > -       je      L(nop)
> > -       leaq    (%rsi,%rdx), %r9
> > -       cmpq    %r9, %rdi
> > -       /* Avoid slow backward REP MOVSB.  */
> > -       jb      L(more_8x_vec_backward)
> > -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> > -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > -       jz      3f
> > -       movq    %rdi, %rcx
> > -       subq    %rsi, %rcx
> > -       jmp     2f
> > -# endif
> > -1:
> > -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> > -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > -       jz      3f
> > -       movq    %rsi, %rcx
> > -       subq    %rdi, %rcx
> > -2:
> > -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> > -   is N*4GB + [1..63] with N >= 0.  */
> > -       cmpl    $63, %ecx
> > -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> > -3:
> > -# endif
> > -       mov     %RDX_LP, %RCX_LP
> > -       rep movsb
> > -L(nop):
> > +#if LARGE_MOV_SIZE
> > +       /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> > +          ENTRY block and L(less_vec).  */
> > +       .p2align 4,, 8
> > +L(between_4_7):
> > +       /* From 4 to 7.  No branch when size == 4.  */
> > +       movl    (%rsi), %ecx
> > +       movl    (%rsi, %rdx), %esi
> > +       movl    %ecx, (%rdi)
> > +       movl    %esi, (%rdi, %rdx)
> >         ret
> >  #endif
> >
> > +       .p2align 4
> >  L(less_vec):
> >         /* Less than 1 VEC.  */
> >  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  # error Unsupported VEC_SIZE!
> >  #endif
> >  #if VEC_SIZE > 32
> > -       cmpb    $32, %dl
> > +       cmpl    $32, %edx
> >         jae     L(between_32_63)
> >  #endif
> >  #if VEC_SIZE > 16
> > -       cmpb    $16, %dl
> > +       cmpl    $16, %edx
> >         jae     L(between_16_31)
> >  #endif
> > -       cmpb    $8, %dl
> > +       cmpl    $8, %edx
> >         jae     L(between_8_15)
> > -       cmpb    $4, %dl
> > +#if SMALL_MOV_SIZE
> > +       cmpl    $4, %edx
> > +#else
> > +       subq    $4, %rdx
> > +#endif
> >         jae     L(between_4_7)
> > -       cmpb    $1, %dl
> > -       ja      L(between_2_3)
> > -       jb      1f
> > -       movzbl  (%rsi), %ecx
> > +       cmpl    $(1 - SMALL_SIZE_OFFSET), %edx
> > +       jl      L(copy_0)
> > +       movb    (%rsi), %cl
> > +       je      L(copy_1)
> > +       movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> > +       movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> > +L(copy_1):
> >         movb    %cl, (%rdi)
> > -1:
> > +L(copy_0):
> >         ret
> > +
> > +#if SMALL_MOV_SIZE
> > +       .p2align 4,, 8
> > +L(between_4_7):
> > +       /* From 4 to 7.  No branch when size == 4.  */
> > +       movl    -4(%rsi, %rdx), %ecx
> > +       movl    (%rsi), %esi
> > +       movl    %ecx, -4(%rdi, %rdx)
> > +       movl    %esi, (%rdi)
> > +       ret
> > +#endif
> > +
> > +#if VEC_SIZE > 16
> > +       /* From 16 to 31.  No branch when size == 16.  */
> > +       .p2align 4,, 8
> > +L(between_16_31):
> > +       vmovdqu (%rsi), %xmm0
> > +       vmovdqu -16(%rsi, %rdx), %xmm1
> > +       vmovdqu %xmm0, (%rdi)
> > +       vmovdqu %xmm1, -16(%rdi, %rdx)
> > +       /* No ymm registers have been touched.  */
> > +       ret
> > +#endif
> > +
> >  #if VEC_SIZE > 32
> > +       .p2align 4,, 10
> >  L(between_32_63):
> >         /* From 32 to 63.  No branch when size == 32.  */
> >         VMOVU   (%rsi), %YMM0
> > -       VMOVU   -32(%rsi,%rdx), %YMM1
> > +       VMOVU   -32(%rsi, %rdx), %YMM1
> >         VMOVU   %YMM0, (%rdi)
> > -       VMOVU   %YMM1, -32(%rdi,%rdx)
> > -       VZEROUPPER_RETURN
> > -#endif
> > -#if VEC_SIZE > 16
> > -       /* From 16 to 31.  No branch when size == 16.  */
> > -L(between_16_31):
> > -       VMOVU   (%rsi), %XMM0
> > -       VMOVU   -16(%rsi,%rdx), %XMM1
> > -       VMOVU   %XMM0, (%rdi)
> > -       VMOVU   %XMM1, -16(%rdi,%rdx)
> > +       VMOVU   %YMM1, -32(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> > +
> > +       .p2align 4,, 10
> >  L(between_8_15):
> >         /* From 8 to 15.  No branch when size == 8.  */
> > -       movq    -8(%rsi,%rdx), %rcx
> > +       movq    -8(%rsi, %rdx), %rcx
> >         movq    (%rsi), %rsi
> > -       movq    %rcx, -8(%rdi,%rdx)
> >         movq    %rsi, (%rdi)
> > +       movq    %rcx, -8(%rdi, %rdx)
> >         ret
> > -L(between_4_7):
> > -       /* From 4 to 7.  No branch when size == 4.  */
> > -       movl    -4(%rsi,%rdx), %ecx
> > -       movl    (%rsi), %esi
> > -       movl    %ecx, -4(%rdi,%rdx)
> > -       movl    %esi, (%rdi)
> > -       ret
> > -L(between_2_3):
> > -       /* From 2 to 3.  No branch when size == 2.  */
> > -       movzwl  -2(%rsi,%rdx), %ecx
> > -       movzwl  (%rsi), %esi
> > -       movw    %cx, -2(%rdi,%rdx)
> > -       movw    %si, (%rdi)
> > -       ret
> >
> > +       .p2align 4,, 10
> > +L(last_4x_vec):
> > +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> > +
> > +       /* VEC(0) and VEC(1) have already been loaded.  */
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> > +       VMOVU   %VEC(0), (%rdi)
> > +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  L(movsb_more_2x_vec):
> >         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> >         ja      L(movsb)
> >  #endif
> >  L(more_2x_vec):
> > -       /* More than 2 * VEC and there may be overlap between destination
> > -          and source.  */
> > +       /* More than 2 * VEC and there may be overlap between
> > +          destination and source.  */
> >         cmpq    $(VEC_SIZE * 8), %rdx
> >         ja      L(more_8x_vec)
> > +       /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >         cmpq    $(VEC_SIZE * 4), %rdx
> >         jbe     L(last_4x_vec)
> > -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
> >         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> >         VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(1), VEC_SIZE(%rdi)
> >         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> >         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> > -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> > -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> > -       VZEROUPPER_RETURN
> > -L(last_4x_vec):
> > -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> > -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> > +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 4
> >  L(more_8x_vec):
> > +       movq    %rdi, %rcx
> > +       subq    %rsi, %rcx
> > +       /* If there is any overlap, always go to the backward temporal
> > +          copy: backward REP MOVSB is slow and we don't want to use NT
> > +          stores when the regions overlap.  */
> > +       cmpq    %rdx, %rcx
> > +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> > +       jb      L(more_8x_vec_backward_check_nop)
> >         /* Check if non-temporal move candidate.  */
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> >         /* Check non-temporal store threshold.  */
> > -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> >         ja      L(large_memcpy_2x)
> >  #endif
> > -       /* Entry if rdx is greater than non-temporal threshold but there
> > -       is overlap.  */
> > +       /* To reach this point, overlap with dst > src is impossible.
> > +          What remains to check is overlap with src > dst, in which
> > +          case correctness requires a forward copy. Otherwise decide
> > +          between backward/forward copy depending on address
> > +          aliasing.  */
> > +
> > +       /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> > +          but less than __x86_shared_non_temporal_threshold.  */
> >  L(more_8x_vec_check):
> > -       cmpq    %rsi, %rdi
> > -       ja      L(more_8x_vec_backward)
> > -       /* Source == destination is less common.  */
> > -       je      L(nop)
> > -       /* Load the first VEC and last 4 * VEC to support overlapping
> > -          addresses.  */
> > -       VMOVU   (%rsi), %VEC(4)
> > +       /* rcx contains dst - src. Add back length (rdx).  */
> > +       leaq    (%rcx, %rdx), %r8
> > +       /* If r8 has different sign than rcx then there is overlap so we
> > +          must do forward copy.  */
> > +       xorq    %rcx, %r8
> > +       /* Isolate just sign bit of r8.  */
> > +       shrq    $63, %r8
> > +       /* Get 4k difference dst - src.  */
> > +       andl    $(PAGE_SIZE - 256), %ecx
> > +       /* If r8 is non-zero we must copy forward for correctness.
> > +          Otherwise, if ecx is zero, dst and src 4k alias, so do the
> > +          backward copy.  */
> > +       addl    %r8d, %ecx
> > +       jz      L(more_8x_vec_backward)
> > +
> > +       /* if rdx is greater than __x86_shared_non_temporal_threshold
> > +          but there is overlap, or from short distance movsb.  */
> > +L(more_8x_vec_forward):
> > +       /* Load first and last 4 * VEC to support overlapping addresses.
> > +        */
> > +
> > +       /* First vec was already loaded into VEC(0).  */
> >         VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
> >         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> > +       /* Save beginning of dst.  */
> > +       movq    %rdi, %rcx
> > +       /* Align dst to VEC_SIZE - 1.  */
> > +       orq     $(VEC_SIZE - 1), %rdi
> >         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> >         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> > -       /* Save start and stop of the destination buffer.  */
> > -       movq    %rdi, %r11
> > -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> > -       /* Align destination for aligned stores in the loop.  Compute
> > -          how much destination is misaligned.  */
> > -       movq    %rdi, %r8
> > -       andq    $(VEC_SIZE - 1), %r8
> > -       /* Get the negative of offset for alignment.  */
> > -       subq    $VEC_SIZE, %r8
> > -       /* Adjust source.  */
> > -       subq    %r8, %rsi
> > -       /* Adjust destination which should be aligned now.  */
> > -       subq    %r8, %rdi
> > -       /* Adjust length.  */
> > -       addq    %r8, %rdx
> >
> > -       .p2align 4
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rcx, %rsi
> > +       /* Finish aligning dst.  */
> > +       incq    %rdi
> > +       /* Restore src adjusted with new value for aligned dst.  */
> > +       addq    %rdi, %rsi
> > +       /* Store end of buffer minus tail in rdx.  */
> > +       leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> > +
> > +       /* Don't use a multi-byte nop to align.  */
> > +       .p2align 4,, 11
> >  L(loop_4x_vec_forward):
> >         /* Copy 4 * VEC a time forward.  */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       VMOVU   (%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
> >         subq    $-(VEC_SIZE * 4), %rsi
> > -       addq    $-(VEC_SIZE * 4), %rdx
> > -       VMOVA   %VEC(0), (%rdi)
> > -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVA   %VEC(1), (%rdi)
> > +       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
> >         subq    $-(VEC_SIZE * 4), %rdi
> > -       cmpq    $(VEC_SIZE * 4), %rdx
> > +       cmpq    %rdi, %rdx
> >         ja      L(loop_4x_vec_forward)
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   %VEC(5), (%rcx)
> > -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> > +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> > +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> > +       VMOVU   %VEC(8), (%rdx)
> >         /* Store the first VEC.  */
> > -       VMOVU   %VEC(4), (%r11)
> > +       VMOVU   %VEC(0), (%rcx)
> > +       /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> > +        */
> > +L(nop_backward):
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 8
> > +L(more_8x_vec_backward_check_nop):
> > +       /* rcx contains dst - src. Test for dst == src to skip all of
> > +          memmove.  */
> > +       testq   %rcx, %rcx
> > +       jz      L(nop_backward)
> >  L(more_8x_vec_backward):
> >         /* Load the first 4 * VEC and last VEC to support overlapping
> >            addresses.  */
> > -       VMOVU   (%rsi), %VEC(4)
> > +
> > +       /* First vec was also loaded into VEC(0).  */
> >         VMOVU   VEC_SIZE(%rsi), %VEC(5)
> >         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> > +       /* Beginning of region for 4x backward copy stored in rcx.  */
> > +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> >         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> > -       /* Save stop of the destination buffer.  */
> > -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> > -       /* Align destination end for aligned stores in the loop.  Compute
> > -          how much destination end is misaligned.  */
> > -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> > -       movq    %r11, %r9
> > -       movq    %r11, %r8
> > -       andq    $(VEC_SIZE - 1), %r8
> > -       /* Adjust source.  */
> > -       subq    %r8, %rcx
> > -       /* Adjust the end of destination which should be aligned now.  */
> > -       subq    %r8, %r9
> > -       /* Adjust length.  */
> > -       subq    %r8, %rdx
> > -
> > -       .p2align 4
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rdi, %rsi
> > +       /* Align dst.  */
> > +       andq    $-(VEC_SIZE), %rcx
> > +       /* Restore src.  */
> > +       addq    %rcx, %rsi
> > +
> > +       /* Don't use multi-byte nop to align.  */
> > +       .p2align 4,, 11
> >  L(loop_4x_vec_backward):
> >         /* Copy 4 * VEC a time backward.  */
> > -       VMOVU   (%rcx), %VEC(0)
> > -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -       addq    $-(VEC_SIZE * 4), %rcx
> > -       addq    $-(VEC_SIZE * 4), %rdx
> > -       VMOVA   %VEC(0), (%r9)
> > -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> > -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> > -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -       addq    $-(VEC_SIZE * 4), %r9
> > -       cmpq    $(VEC_SIZE * 4), %rdx
> > -       ja      L(loop_4x_vec_backward)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> > +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> > +       addq    $(VEC_SIZE * -4), %rsi
> > +       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> > +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> > +       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> > +       addq    $(VEC_SIZE * -4), %rcx
> > +       cmpq    %rcx, %rdi
> > +       jb      L(loop_4x_vec_backward)
> >         /* Store the first 4 * VEC.  */
> > -       VMOVU   %VEC(4), (%rdi)
> > +       VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(5), VEC_SIZE(%rdi)
> >         VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> >         VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> >         /* Store the last VEC.  */
> > -       VMOVU   %VEC(8), (%r11)
> > +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> > +       VZEROUPPER_RETURN
> > +
> > +#if defined USE_MULTIARCH && IS_IN (libc)
> > +       /* L(skip_short_movsb_check) is only used with ERMS. Not for
> > +          FSRM.  */
> > +       .p2align 5,, 16
> > +# if ALIGN_MOVSB
> > +L(skip_short_movsb_check):
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  endif
> > +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> > +#   error Unsupported MOVSB_ALIGN_TO
> > +#  endif
> > +       /* If the CPU does not have FSRM there are two options for
> > +          aligning: align src if dst and src 4k alias, otherwise
> > +          align dst.  */
> > +       testl   $(PAGE_SIZE - 512), %ecx
> > +       jnz     L(movsb_align_dst)
> > +       /* Fall through: dst and src 4k alias. It's better to align
> > +          src here because the bottleneck will be the loads due to
> > +          the false dependency on dst.  */
> > +
> > +       /* rcx already has dst - src.  */
> > +       movq    %rcx, %r9
> > +       /* Add src to len. Subtract back after src aligned. -1 because
> > +          src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> > +       leaq    -1(%rsi, %rdx), %rcx
> > +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> > +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> > +       /* Restore dst and len adjusted with new values for the
> > +          aligned src.  */
> > +       leaq    1(%rsi, %r9), %rdi
> > +       subq    %rsi, %rcx
> > +       /* Finish aligning src.  */
> > +       incq    %rsi
> > +
> > +       rep     movsb
> > +
> > +       VMOVU   %VEC(0), (%r8)
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +#  endif
> >         VZEROUPPER_RETURN
> > +# endif
> > +
> > +       .p2align 4,, 12
> > +L(movsb):
> > +       movq    %rdi, %rcx
> > +       subq    %rsi, %rcx
> > +       /* If there is any overlap, always go to the backward temporal
> > +          copy: backward REP MOVSB is slow and we don't want to use NT
> > +          stores when the regions overlap.  */
> > +       cmpq    %rdx, %rcx
> > +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> > +       jb      L(more_8x_vec_backward_check_nop)
> > +# if ALIGN_MOVSB
> > +       /* Save dest for storing aligning VECs later.  */
> > +       movq    %rdi, %r8
> > +# endif
> > +       /* If above __x86_rep_movsb_stop_threshold it is most likely
> > +          a candidate for NT moves as well.  */
> > +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> > +       jae     L(large_memcpy_2x_check)
> > +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> > +       /* Only avoid short movsb if CPU has FSRM.  */
> > +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > +       jz      L(skip_short_movsb_check)
> > +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +       /* Avoid "rep movsb" if RCX, the distance between source and
> > +          destination, is N*4GB + [1..63] with N >= 0.  */
> > +
> > +       /* ecx contains dst - src. Because the backward-copy
> > +          conditions were already checked, the only remaining slow
> > +          movsb case, src = dst + [0, 63], corresponds to ecx in
> > +          [-63, 0]. Use an unsigned comparison against -64 to check
> > +          for that case.  */
> > +       cmpl    $-64, %ecx
> > +       ja      L(more_8x_vec_forward)
> > +#  endif
> > +# endif
> > +# if ALIGN_MOVSB
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  endif
> > +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> > +#   error Unsupported MOVSB_ALIGN_TO
> > +#  endif
> > +       /* A fall through means the CPU has FSRM. In that case
> > +          exclusively align the destination.  */
> > +L(movsb_align_dst):
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rdi, %rsi
> > +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> > +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> > +       /* Add dst to len. Subtract back after dst aligned.  */
> > +       leaq    (%r8, %rdx), %rcx
> > +       /* Finish aligning dst.  */
> > +       andq    $-(MOVSB_ALIGN_TO), %rdi
> > +       /* Restore src and len adjusted with new values for aligned dst.
> > +        */
> > +       addq    %rdi, %rsi
> > +       subq    %rdi, %rcx
> > +
> > +       rep     movsb
> > +
> > +       /* Store VECs loaded for aligning.  */
> > +       VMOVU   %VEC(0), (%r8)
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +#  endif
> > +       VZEROUPPER_RETURN
> > +# else /* !ALIGN_MOVSB.  */
> > +L(skip_short_movsb_check):
> > +       mov     %RDX_LP, %RCX_LP
> > +       rep     movsb
> > +       ret
> > +# endif
> > +#endif
> >
> > +       .p2align 4,, 10
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -       .p2align 4
> > +L(large_memcpy_2x_check):
> > +       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> > +       jb      L(more_8x_vec_check)
> >  L(large_memcpy_2x):
> > -       /* Compute absolute value of difference between source and
> > -          destination.  */
> > -       movq    %rdi, %r9
> > -       subq    %rsi, %r9
> > -       movq    %r9, %r8
> > -       leaq    -1(%r9), %rcx
> > -       sarq    $63, %r8
> > -       xorq    %r8, %r9
> > -       subq    %r8, %r9
> > -       /* Don't use non-temporal store if there is overlap between
> > -          destination and source since destination may be in cache when
> > -          source is loaded.  */
> > -       cmpq    %r9, %rdx
> > -       ja      L(more_8x_vec_check)
> > +       /* To reach this point, overlap with dst > src is impossible.
> > +          What remains to check is overlap with src > dst. rcx
> > +          already contains dst - src; negate rcx to get src - dst.
> > +          If length > rcx then there is overlap and a forward copy
> > +          is best.  */
> > +       negq    %rcx
> > +       cmpq    %rcx, %rdx
> > +       ja      L(more_8x_vec_forward)
> >
> >         /* Cache align destination. First store the first 64 bytes then
> >            adjust alignments.  */
> > -       VMOVU   (%rsi), %VEC(8)
> > -#if VEC_SIZE < 64
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > -#if VEC_SIZE < 32
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > -#endif
> > -#endif
> > -       VMOVU   %VEC(8), (%rdi)
> > -#if VEC_SIZE < 64
> > -       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > -#if VEC_SIZE < 32
> > -       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > -#endif
> > -#endif
> > +
> > +       /* First vec was also loaded into VEC(0).  */
> > +# if VEC_SIZE < 64
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  if VEC_SIZE < 32
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +#  endif
> > +# endif
> > +       VMOVU   %VEC(0), (%rdi)
> > +# if VEC_SIZE < 64
> > +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > +#  if VEC_SIZE < 32
> > +       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +#  endif
> > +# endif
> > +
> >         /* Adjust source, destination, and size.  */
> >         movq    %rdi, %r8
> >         andq    $63, %r8
> > @@ -614,9 +767,13 @@ L(large_memcpy_2x):
> >         /* Adjust length.  */
> >         addq    %r8, %rdx
> >
> > -       /* Test if source and destination addresses will alias. If they do
> > -          the larger pipeline in large_memcpy_4x alleviated the
> > +       /* Test if source and destination addresses will alias. If they
> > +          do the larger pipeline in large_memcpy_4x alleviated the
> >            performance drop.  */
> > +
> > +       /* ecx contains -(dst - src). notl %ecx yields dst - src - 1,
> > +          which works for testing aliasing.  */
> > +       notl    %ecx
> >         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> >         jz      L(large_memcpy_4x)
> >
> > @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
> >         /* ecx stores inner loop counter.  */
> >         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> >  L(loop_large_memcpy_4x_inner):
> > -       /* Only one prefetch set per page as doing 4 pages give more time
> > -          for prefetcher to keep up.  */
> > +       /* Only one prefetch set per page as doing 4 pages gives more
> > +          time for the prefetcher to keep up.  */
> >         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 46+ messages in thread
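
The control flow that the new L(more_8x_vec)/L(movsb) comments in the patch
above describe can be summarized in plain C.  The sketch below is
illustrative only: the names (choose_memmove_path, NT_THRESHOLD,
round_up_excl) and the fixed threshold are placeholders rather than glibc
internals, and the ordering of the checks is simplified relative to the
assembly, which folds the src > dst overlap test into a sign trick on
(dst - src) and (dst - src) + len.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum path { FORWARD_4X, BACKWARD_4X, NOP, NONTEMPORAL };

/* Placeholder for __x86_shared_non_temporal_threshold; the real value is
   computed from cache sizes at startup.  */
#define NT_THRESHOLD (1u << 20)
#define PAGE_SIZE 4096

/* Rounding helpers matching the asm idioms: "orq $(VEC_SIZE - 1); incq"
   moves dst to the next VEC_SIZE boundary strictly above it (the first
   VEC is stored unconditionally), while "andq $-(VEC_SIZE)" rounds the
   end of the backward region down.  align must be a power of two.  */
static inline uintptr_t
round_up_excl (uintptr_t p, uintptr_t align)
{
  return (p | (align - 1)) + 1;
}

static inline uintptr_t
round_down (uintptr_t p, uintptr_t align)
{
  return p & -align;
}

/* Sketch of the dispatch performed for copies of more than 8 * VEC.  */
static enum path
choose_memmove_path (char *dst, const char *src, size_t len)
{
  intptr_t diff = (intptr_t) dst - (intptr_t) src;

  if (diff == 0)
    return NOP;			/* src == dst: nothing to do.  */

  /* Overlap with dst > src (0 < diff < len): the backward copy is
     required and also avoids slow backward REP MOVSB / NT stores.  */
  if ((uintptr_t) diff < len)
    return BACKWARD_4X;

  /* Overlap with src > dst (0 < -diff < len): correctness requires a
     forward copy.  */
  if ((uintptr_t) -diff < len)
    return FORWARD_4X;

  /* No overlap: large copies can use non-temporal stores.  */
  if (len > NT_THRESHOLD)
    return NONTEMPORAL;

  /* 4k aliasing: if dst and src share (nearly) the same page offset, a
     forward copy makes the loads falsely depend on earlier stores, so
     copy backward; otherwise copy forward.  */
  if ((diff & (PAGE_SIZE - 256)) == 0)
    return BACKWARD_4X;
  return FORWARD_4X;
}

int
main (void)
{
  char buf[16384];
  /* Overlap with dst > src -> backward.  */
  printf ("%d\n", choose_memmove_path (buf + 64, buf, 8192));
  /* 4k aliased but no overlap -> backward to dodge false dependencies.  */
  printf ("%d\n", choose_memmove_path (buf, buf + 4096, 2048));
  /* No overlap, no aliasing -> forward.  */
  printf ("%d\n", choose_memmove_path (buf, buf + 4096 + 512, 2048));
  (void) round_up_excl ((uintptr_t) buf, 32);
  (void) round_down ((uintptr_t) buf, 32);
  return 0;
}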

* Re: [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
  2021-11-06 19:10     ` H.J. Lu
@ 2022-04-23  1:42       ` Sunil Pandey
  0 siblings, 0 replies; 46+ messages in thread
From: Sunil Pandey @ 2022-04-23  1:42 UTC (permalink / raw)
  To: H.J. Lu, libc-stable; +Cc: Noah Goldstein, GNU C Library

On Sat, Nov 6, 2021 at 12:11 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sat, Nov 6, 2021 at 11:36 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > No bug.
> >
> > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > benchmarks the vector copy loop, especially now that it handles 4k
> > aliasing, is better for these medium-sized copies.
> >
> > On Skylake with ERMS:
> >
> > Size,   Align1, Align2, dst>src, (rep movsb) / (vec copy)
> > 4096,   0,      0,      0,      0.975
> > 4096,   0,      0,      1,      0.953
> > 4096,   12,     0,      0,      0.969
> > 4096,   12,     0,      1,      0.872
> > 4096,   44,     0,      0,      0.979
> > 4096,   44,     0,      1,      0.83
> > 4096,   0,      12,     0,      1.006
> > 4096,   0,      12,     1,      0.989
> > 4096,   0,      44,     0,      0.739
> > 4096,   0,      44,     1,      0.942
> > 4096,   12,     12,     0,      1.009
> > 4096,   12,     12,     1,      0.973
> > 4096,   44,     44,     0,      0.791
> > 4096,   44,     44,     1,      0.961
> > 4096,   2048,   0,      0,      0.978
> > 4096,   2048,   0,      1,      0.951
> > 4096,   2060,   0,      0,      0.986
> > 4096,   2060,   0,      1,      0.963
> > 4096,   2048,   12,     0,      0.971
> > 4096,   2048,   12,     1,      0.941
> > 4096,   2060,   12,     0,      0.977
> > 4096,   2060,   12,     1,      0.949
> > 8192,   0,      0,      0,      0.85
> > 8192,   0,      0,      1,      0.845
> > 8192,   13,     0,      0,      0.937
> > 8192,   13,     0,      1,      0.939
> > 8192,   45,     0,      0,      0.932
> > 8192,   45,     0,      1,      0.927
> > 8192,   0,      13,     0,      0.621
> > 8192,   0,      13,     1,      0.62
> > 8192,   0,      45,     0,      0.53
> > 8192,   0,      45,     1,      0.516
> > 8192,   13,     13,     0,      0.664
> > 8192,   13,     13,     1,      0.659
> > 8192,   45,     45,     0,      0.593
> > 8192,   45,     45,     1,      0.575
> > 8192,   2048,   0,      0,      0.854
> > 8192,   2048,   0,      1,      0.834
> > 8192,   2061,   0,      0,      0.863
> > 8192,   2061,   0,      1,      0.857
> > 8192,   2048,   13,     0,      0.63
> > 8192,   2048,   13,     1,      0.629
> > 8192,   2061,   13,     0,      0.627
> > 8192,   2061,   13,     1,      0.62
> > ---
> >  sysdeps/x86/dl-cacheinfo.h   |  8 +++++---
> >  sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
> >  2 files changed, 20 insertions(+), 14 deletions(-)
> >
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index e6c94dfd02..2e43e67e4f 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> >    unsigned int minimum_rep_movsb_threshold;
> >  #endif
> > -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> > +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
> > +     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
> > +     threshold is 2048 * (VEC_SIZE / 16).  */
> >    unsigned int rep_movsb_threshold;
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> >      {
> > -      rep_movsb_threshold = 2048 * (64 / 16);
> > +      rep_movsb_threshold = 4096 * (64 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 64 * 8;
> >  #endif
> > @@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> >                                     AVX_Fast_Unaligned_Load))
> >      {
> > -      rep_movsb_threshold = 2048 * (32 / 16);
> > +      rep_movsb_threshold = 4096 * (32 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 32 * 8;
> >  #endif
> > diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> > index dd6e1d65c9..419313804d 100644
> > --- a/sysdeps/x86/dl-tunables.list
> > +++ b/sysdeps/x86/dl-tunables.list
> > @@ -32,17 +32,21 @@ glibc {
> >      }
> >      x86_rep_movsb_threshold {
> >        type: SIZE_T
> > -      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
> > -      # isn't faster on short data.  The memcpy micro benchmark in glibc
> > -      # shows that 2KB is the approximate value above which REP MOVSB
> > -      # becomes faster than SSE2 optimization on processors with Enhanced
> > -      # REP MOVSB.  Since larger register size can move more data with a
> > -      # single load and store, the threshold is higher with larger register
> > -      # size.  Note: Since the REP MOVSB threshold must be greater than 8
> > -      # times of vector size and the default value is 2048 * (vector size
> > -      # / 16), the default value and the minimum value must be updated at
> > -      # run-time.  NB: Don't set the default value since we can't tell if
> > -      # the tunable value is set by user or not [BZ #27069].
> > +      # Since there is overhead to set up REP MOVSB operation, REP
> > +      # MOVSB isn't faster on short data.  The memcpy micro benchmark
> > +      # in glibc shows that 2KB is the approximate value above which
> > +      # REP MOVSB becomes faster than SSE2 optimization on processors
> > +      # with Enhanced REP MOVSB.  Since larger register size can move
> > +      # more data with a single load and store, the threshold is
> > +      # higher with larger register size.  Micro benchmarks show AVX
> > +      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
> > +      # threshold is extrapolated to 16KB.  For machines with FSRM the
> > +      # threshold is universally set at 2112 bytes.  Note: Since the
> > +      # REP MOVSB threshold must be greater than 8 times of vector
> > +      # size and the default value is 4096 * (vector size / 16), the
> > +      # default value and the minimum value must be updated at
> > +      # run-time.  NB: Don't set the default value since we can't tell
> > +      # if the tunable value is set by user or not [BZ #27069].
> >        minval: 1
> >      }
> >      x86_rep_stosb_threshold {
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 46+ messages in thread
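
As a quick sanity check of the new defaults in the patch above, the
arithmetic from dl_init_cacheinfo can be reproduced stand-alone.  The
sketch below is only illustrative: the cpu_features checks are collapsed
into an enum keyed by vector size, and the HAVE_TUNABLES handling is
omitted.

#include <stdio.h>

/* Reduced stand-in for the cpu_features checks in dl-cacheinfo.h.  */
enum isa { ISA_SSE2 = 16, ISA_AVX = 32, ISA_AVX512 = 64 };

/* Default REP MOVSB threshold after this patch: 4096 * (VEC_SIZE / 16)
   for 32- and 64-byte vectors, 2048 * (VEC_SIZE / 16) for 16-byte
   vectors.  The minimum stays at 8 * VEC_SIZE.  */
static unsigned int
default_rep_movsb_threshold (enum isa vec_size)
{
  unsigned int base = vec_size >= 32 ? 4096 : 2048;
  return base * (vec_size / 16);
}

static unsigned int
minimum_rep_movsb_threshold (enum isa vec_size)
{
  return 8 * vec_size;
}

int
main (void)
{
  printf ("AVX512: default %u, min %u\n",
	  default_rep_movsb_threshold (ISA_AVX512),
	  minimum_rep_movsb_threshold (ISA_AVX512));	/* 16384, 512 */
  printf ("AVX:    default %u, min %u\n",
	  default_rep_movsb_threshold (ISA_AVX),
	  minimum_rep_movsb_threshold (ISA_AVX));	/* 8192, 256 */
  printf ("SSE2:   default %u, min %u\n",
	  default_rep_movsb_threshold (ISA_SSE2),
	  minimum_rep_movsb_threshold (ISA_SSE2));	/* 2048, 128 */
  return 0;
}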

end of thread, other threads:[~2022-04-23  1:43 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-01  5:49 [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
2021-11-01  5:49 ` [PATCH v1 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
2021-11-06  2:27   ` H.J. Lu
2021-11-01  5:49 ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
2021-11-06  2:28   ` H.J. Lu
2021-11-01  5:49 ` [PATCH v1 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
2021-11-01  5:52   ` Noah Goldstein
2021-11-06  2:29   ` H.J. Lu
2021-11-01  5:49 ` [PATCH v1 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
2021-11-06  2:31   ` H.J. Lu
2021-11-06  4:39     ` Noah Goldstein
2021-11-06 12:04       ` H.J. Lu
2021-11-06 17:38         ` Noah Goldstein
2021-11-06  2:27 ` [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
2021-11-06  4:39 ` [PATCH v2 " Noah Goldstein
2021-11-06  4:39   ` [PATCH v2 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
2021-11-06  4:39   ` [PATCH v2 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
2021-11-06  4:39   ` [PATCH v2 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
2021-11-06  4:39   ` [PATCH v2 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
2021-11-06 17:37 ` [PATCH v3 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
2021-11-06 17:37   ` [PATCH v3 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
2021-11-06 17:37   ` [PATCH v3 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
2021-11-06 17:37   ` [PATCH v3 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
2021-11-06 17:37   ` [PATCH v3 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
2021-11-06 17:56     ` H.J. Lu
2021-11-06 18:11       ` Noah Goldstein
2021-11-06 18:21         ` H.J. Lu
2021-11-06 18:34           ` Noah Goldstein
2021-11-06 18:33 ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein
2021-11-06 18:33   ` [PATCH v4 2/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein
2021-11-06 19:12     ` H.J. Lu
2021-11-06 18:33   ` [PATCH v4 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein
2021-11-06 19:11     ` H.J. Lu
2021-11-06 18:33   ` [PATCH v4 4/5] x86: Optimize memmove-vec-unaligned-erms.S Noah Goldstein
2021-11-06 19:11     ` H.J. Lu
2022-04-23  1:41       ` Sunil Pandey
2021-11-06 18:33   ` [PATCH v4 5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h Noah Goldstein
2021-11-06 19:10     ` H.J. Lu
2022-04-23  1:42       ` Sunil Pandey
2021-11-06 19:12   ` [PATCH v4 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu
2021-11-06 21:20     ` Noah Goldstein
2021-11-07 13:53       ` H.J. Lu
2021-12-07 21:10   ` Stafford Horne
2021-12-07 21:36     ` Noah Goldstein
2021-12-07 22:07       ` Stafford Horne
2021-12-07 22:13         ` Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).